From 571d91dcadfa3cef499010b4eddb9b58b0da4d24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:19 -0700 Subject: perf: Add branch stack counters Currently, the additional information of a branch entry is stored in a u64 space. With more and more information added, the space is running out. For example, the information of occurrences of events will be added for each branch. Two places were suggested to append the counters. https://lore.kernel.org/lkml/20230802215814.GH231007@hirez.programming.kicks-ass.net/ One place is right after the flags of each branch entry. It changes the existing struct perf_branch_entry. The later ARCH specific implementation has to be really careful to consistently pick the right struct. The other place is right after the entire struct perf_branch_stack. The disadvantage is that the pointer of the extra space has to be recorded. The common interface perf_sample_save_brstack() has to be updated. The latter is much more straightforward, and should be easily understood and maintained. It is implemented in this patch. Add a new branch sample type, PERF_SAMPLE_BRANCH_COUNTERS, to indicate the event which is recorded in the branch info. The "u64 counters" may store the occurrences of several events. The information regarding the number of events/counters and the width of each counter should be exposed via sysfs as a reference for the perf tool. Define the branch_counter_nr and branch_counter_width ABI here. The support will be implemented later in the Intel-specific patch. 
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-1-kan.liang@linux.intel.com --- include/linux/perf_event.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0367d748fae0e..7897ef0660272 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1139,6 +1139,10 @@ static inline bool branch_sample_priv(const struct perf_event *event) return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } +static inline bool branch_sample_counters(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; +} struct perf_sample_data { /* @@ -1173,6 +1177,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; + u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; @@ -1250,7 +1255,8 @@ static inline void perf_sample_save_raw_data(struct perf_sample_data *data, static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, - struct perf_branch_stack *brs) + struct perf_branch_stack *brs, + u64 *brs_cntr) { int size = sizeof(u64); /* nr */ @@ -1258,7 +1264,16 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data, size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); + /* + * The extension space for counters is appended after the + * struct perf_branch_stack. It is used to store the occurrences + * of events of each branch. 
+ */ + if (brs_cntr) + size += brs->nr * sizeof(u64); + data->br_stack = brs; + data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } -- cgit v1.2.3 From 1f2376cd03dd3b965d130ed46a7c92769d614ba1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 25 Oct 2023 13:16:21 -0700 Subject: perf: Add branch_sample_call_stack Add a helper function to check call stack sample type. The later patch will invoke the function in several places. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231025201626.3000228-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7897ef0660272..ac1a59c1f2522 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1144,6 +1144,11 @@ static inline bool branch_sample_counters(const struct perf_event *event) return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } +static inline bool branch_sample_call_stack(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; +} + struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, -- cgit v1.2.3 From d23b5c577715892c87533b13923306acc6243f93 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:29 +0000 Subject: cgroup: Make operations on the cgroup root_list RCU safe At present, when we perform operations on the cgroup root_list, we must hold the cgroup_mutex, which is a relatively heavyweight lock. In reality, we can make operations on this list RCU-safe, eliminating the need to hold the cgroup_mutex during traversal. Modifications to the list only occur in the cgroup root setup and destroy paths, which should be infrequent in a production environment. In contrast, traversal may occur frequently. 
Therefore, making it RCU-safe would be beneficial. Signed-off-by: Yafang Shao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4a6b6b77ccb6c..4caab0c6b3611 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -563,6 +563,7 @@ struct cgroup_root { /* A list running through the active hierarchies */ struct list_head root_list; + struct rcu_head rcu; /* Hierarchy-specific flags */ unsigned int flags; -- cgit v1.2.3 From aecd408b7e50742868b3305c24325a89024e2a30 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Oct 2023 06:14:32 +0000 Subject: cgroup: Add a new helper for cgroup1 hierarchy A new helper is added for cgroup1 hierarchy: - task_get_cgroup1 Acquires the associated cgroup of a task within a specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its hierarchy ID. This helper function is added to facilitate the tracing of tasks within a particular container or cgroup dir in BPF programs. It's important to note that this helper is designed specifically for cgroup1 only. tj: Use irqsave/restore as suggested by Hou Tao. Suggested-by: Tejun Heo Signed-off-by: Yafang Shao Cc: Hou Tao Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0ef0af66080ed..34aaf0e87def8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -69,6 +69,7 @@ struct css_task_iter { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern spinlock_t css_set_lock; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include @@ -386,7 +387,6 @@ static inline void cgroup_unlock(void) * as locks used during the cgroup_subsys::attach() methods. 
*/ #ifdef CONFIG_PROVE_RCU -extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ @@ -853,4 +853,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ +struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); + #endif /* _LINUX_CGROUP_H */ -- cgit v1.2.3 From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Nov 2023 20:57:23 -0800 Subject: bpf: Add __bpf_dynptr_data* for in kernel use Different types of bpf dynptr have different internal data storage. Specifically, SKB and XDP type of dynptr may have non-continuous data. Therefore, it is not always safe to directly access dynptr->data. Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to dynptr->data. Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of dynptr->data. Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb292..eb84caf133df9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1222,6 +1222,8 @@ enum bpf_dynptr_type { int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); -- cgit v1.2.3 From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:37 -0800 Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into 
btf_field_type enum This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the btf_field_type enum. Further patches in the series will use BPF_GRAPH_NODE, so let's move this useful definition out of btf.c. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb84caf133df9..4001d11be1516 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -186,8 +186,8 @@ enum btf_field_type { BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | - BPF_RB_NODE | BPF_RB_ROOT, + BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, + BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), }; -- cgit v1.2.3 From 689b097a06bafb461ec162fc3b3ecc9765cea67b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 6 Nov 2023 03:18:02 +0000 Subject: compiler-gcc: Suppress -Wmissing-prototypes warning for all supported GCC The kernel supports a minimum GCC version of 5.1.0 for building. However, the "__diag_ignore_all" directive only suppresses the "-Wmissing-prototypes" warning for GCC versions >= 8.0.0. As a result, when building the kernel with older GCC versions, warnings may be triggered. 
The example below illustrates the warnings reported by the kernel test robot using GCC 7.5.0: compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0 All warnings (new ones prefixed by >>): kernel/bpf/helpers.c:1893:19: warning: no previous prototype for 'bpf_obj_new_impl' [-Wmissing-prototypes] __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) ^~~~~~~~~~~~~~~~ kernel/bpf/helpers.c:1907:19: warning: no previous prototype for 'bpf_percpu_obj_new_impl' [-Wmissing-prototypes] __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) [...] To address this, we should also suppress the "-Wmissing-prototypes" warning for older GCC versions. "#pragma GCC diagnostic push" is supported as of GCC 4.6, and both "-Wmissing-prototypes" and "-Wmissing-declarations" are supported for all the GCC versions that we currently support. Therefore, it is reasonable to suppress these warnings for all supported GCC versions. With this adjustment, it's important to note that after implementing "__diag_ignore_all", it will effectively suppress warnings for all the supported GCC versions. In the future, if you wish to suppress warnings that are only supported on higher GCC versions, it is advisable to explicitly use "__diag_ignore" to specify the GCC version you are targeting. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311031651.A7crZEur-lkp@intel.com/ Suggested-by: Arnd Bergmann Signed-off-by: Yafang Shao Cc: Kumar Kartikeya Dwivedi Cc: Arnd Bergmann Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231106031802.4188-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2ceba3fe4ec16..aebb65bf95a79 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -136,7 +136,7 @@ #endif #define __diag_ignore_all(option, comment) \ - __diag_GCC(8, ignore, option) + __diag(__diag_GCC_ignore option) /* * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" -- cgit v1.2.3 From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:52 -0400 Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask When the "isolcpus" boot command line option is used to add a set of isolated CPUs, those CPUs will be excluded automatically from wq_unbound_cpumask to avoid running work functions from unbound workqueues. Recently cpuset has been extended to allow the creation of partitions of isolated CPUs dynamically. To make it closer to the "isolcpus" in functionality, the CPUs in those isolated cpuset partitions should be excluded from wq_unbound_cpumask as well. This can be done currently by explicitly writing to the workqueue's cpumask sysfs file after creating the isolated partitions. However, this process can be error prone. 
Ideally, the cpuset code should be allowed to request the workqueue code to exclude those isolated CPUs from wq_unbound_cpumask so that this operation can be done automatically and the isolated CPUs will be returned back to wq_unbound_cpumask after the destructions of the isolated cpuset partitions. This patch adds a new workqueue_unbound_exclude_cpumask() function to enable that. This new function will exclude the specified isolated CPUs from wq_unbound_cpumask. To be able to restore those isolated CPUs back after the destruction of isolated cpuset partitions, a new wq_requested_unbound_cpumask is added to store the user provided unbound cpumask either from the boot command line options or from writing to the cpumask sysfs file. This new cpumask provides the basis for CPU exclusion. To enable users to understand how the wq_unbound_cpumask is being modified internally, this patch also exposes the newly introduced wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to store the cpumask to be excluded from wq_unbound_cpumask as read-only sysfs files. 
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d4..b0b9604b76b88 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -- cgit v1.2.3 From e76d28bdf9ba5388b8c4835a5199dc427b603188 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 3 Nov 2023 23:13:01 -0400 Subject: cgroup/rstat: Reduce cpu_lock hold time in cgroup_rstat_flush_locked() When cgroup_rstat_updated() isn't being called concurrently with cgroup_rstat_flush_locked(), its run time is pretty short. When both are called concurrently, the cgroup_rstat_updated() run time can spike to a pretty high value due to high cpu_lock hold time in cgroup_rstat_flush_locked(). This can be problematic if the task calling cgroup_rstat_updated() is a realtime task running on an isolated CPU with a strict latency requirement. The cgroup_rstat_updated() call can happen when there is a page fault even though the task is running in user space most of the time. The percpu cpu_lock is used to protect the update tree - updated_next and updated_children. This protection is only needed when cgroup_rstat_cpu_pop_updated() is being called. The subsequent flushing operation which can take a much longer time does not need that protection as it is already protected by cgroup_rstat_lock. 
To reduce the cpu_lock hold time, we need to perform all the cgroup_rstat_cpu_pop_updated() calls up front with the lock released afterward before doing any flushing. This patch adds a new cgroup_rstat_updated_list() function to return a singly linked list of cgroups to be flushed. Some instrumentation code are added to measure the cpu_lock hold time right after lock acquisition to after releasing the lock. Parallel kernel build on a 2-socket x86-64 server is used as the benchmarking tool for measuring the lock hold time. The maximum cpu_lock hold time before and after the patch are 100us and 29us respectively. So the worst case time is reduced to about 30% of the original. However, there may be some OS or hardware noises like NMI or SMI in the test system that can worsen the worst case value. Those noises are usually tuned out in a real production environment to get a better result. OTOH, the lock hold time frequency distribution should give a better idea of the performance benefit of the patch. Below were the frequency distribution before and after the patch: Hold time Before patch After patch --------- ------------ ----------- 0-01 us 804,139 13,738,708 01-05 us 9,772,767 1,177,194 05-10 us 4,595,028 4,984 10-15 us 303,481 3,562 15-20 us 78,971 1,314 20-25 us 24,583 18 25-30 us 6,908 12 30-40 us 8,015 40-50 us 2,192 50-60 us 316 60-70 us 43 70-80 us 7 80-90 us 2 >90 us 3 Signed-off-by: Waiman Long Reviewed-by: Yosry Ahmed Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4caab0c6b3611..37518436cfe7f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -496,6 +496,13 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; + /* + * A singly-linked list of cgroup structures to be rstat flushed. 
+ * This is a scratch field to be used exclusively by + * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + */ + struct cgroup *rstat_flush_next; + /* cgroup basic resource statistics */ struct cgroup_base_stat last_bstat; struct cgroup_base_stat bstat; -- cgit v1.2.3 From 8156c7dd47b92fc4a70c9ea58e7a9e88c8bc32be Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:21 +0200 Subject: regulator: Introduce handling for system-critical under-voltage events Handle under-voltage events for crucial regulators to maintain system stability and avoid issues during power drops. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20231026144824.4065145-3-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 621b7f4a36395..e0ddfb5593c92 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -49,6 +49,13 @@ struct regulator; #define DISABLE_IN_SUSPEND 1 #define ENABLE_IN_SUSPEND 2 +/* + * Default time window (in milliseconds) following a critical under-voltage + * event during which less critical actions can be safely carried out by the + * system. + */ +#define REGULATOR_DEF_UV_LESS_CRITICAL_WINDOW_MS 10 + /* Regulator active discharge flags */ enum regulator_active_discharge { REGULATOR_ACTIVE_DISCHARGE_DEFAULT, @@ -127,6 +134,8 @@ struct notification_limit { * @ramp_disable: Disable ramp delay when initialising or when setting voltage. * @soft_start: Enable soft start so that voltage ramps slowly. * @pull_down: Enable pull down when regulator is disabled. + * @system_critical: Set if the regulator is critical to system stability or + * functionality. * @over_current_protection: Auto disable on over current event. * * @over_current_detection: Configure over current limits. 
@@ -214,6 +223,7 @@ struct regulation_constraints { unsigned ramp_disable:1; /* disable ramp delay */ unsigned soft_start:1; /* ramp voltage slowly */ unsigned pull_down:1; /* pull down resistor when regulator off */ + unsigned system_critical:1; /* critical to system stability */ unsigned over_current_protection:1; /* auto disable on over current */ unsigned over_current_detection:1; /* notify on over current */ unsigned over_voltage_detection:1; /* notify on over voltage */ -- cgit v1.2.3 From 1e22152aa59d793743fc53051dd7a042f362aecb Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 26 Oct 2023 16:48:24 +0200 Subject: regulator: Implement uv_survival_time for handling under-voltage events Add 'uv_survival_time' field to regulation_constraints for specifying survival time post critical under-voltage event. Update the regulator notifier call chain and Device Tree property parsing to use this new field, allowing a configurable timeout before emergency shutdown. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20231026144824.4065145-6-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index e0ddfb5593c92..0cd76d2647274 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -162,6 +162,13 @@ struct notification_limit { * regulator_active_discharge values are used for * initialisation. * @enable_time: Turn-on time of the rails (unit: microseconds) + * @uv_less_critical_window_ms: Specifies the time window (in milliseconds) + * following a critical under-voltage (UV) event + * during which less critical actions can be + * safely carried out by the system (for example + * logging). After this time window more critical + * actions should be done (for example prevent + * HW damage). 
*/ struct regulation_constraints { @@ -213,6 +220,7 @@ struct regulation_constraints { unsigned int settling_time_up; unsigned int settling_time_down; unsigned int enable_time; + unsigned int uv_less_critical_window_ms; unsigned int active_discharge; -- cgit v1.2.3 From f3b8788cde61b02f1e6c202f8fac4360e6adbafc Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:46 -0700 Subject: LSM: Identify modules by more than name Create a struct lsm_id to contain identifying information about Linux Security Modules (LSMs). At inception this contains the name of the module and an identifier associated with the security module. Change the security_add_hooks() interface to use this structure. Change the individual modules to maintain their own struct lsm_id and pass it to security_add_hooks(). The values for LSM identifiers are defined in a new UAPI header file linux/lsm.h. Each existing LSM has been updated to include its LSMID in the lsm_id. The LSM ID values are sequential, with the oldest module LSM_ID_CAPABILITY being the lowest value and the existing modules numbered in the order they were included in the main line kernel. This is an arbitrary convention for assigning the values, but none better presents itself. The value 0 is defined as being invalid. The values 1-99 are reserved for any special case uses which may arise in the future. This may include attributes of the LSM infrastructure itself, possibly related to namespacing or network attribute management. A special range is identified for such attributes to help reduce confusion for developers unfamiliar with LSMs. LSM attribute values are defined for the attributes presented by modules that are available today. As with the LSM IDs, the value 0 is defined as being invalid. The values 1-99 are reserved for any special case uses which may arise in the future. 
Cc: linux-security-module Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Kees Cook Nacked-by: Tetsuo Handa [PM: forward ported beyond v6.6 due merge window changes] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index dcb5e5b5eb135..7f0adb33caaae 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -42,6 +42,18 @@ struct security_hook_heads { #undef LSM_HOOK } __randomize_layout; +/** + * struct lsm_id - Identify a Linux Security Module. + * @lsm: name of the LSM, must be approved by the LSM maintainers + * @id: LSM ID number from uapi/linux/lsm.h + * + * Contains the information that identifies the LSM. + */ +struct lsm_id { + const char *name; + u64 id; +}; + /* * Security module hook list structure. * For use with generic list macros for common operations. @@ -50,7 +62,7 @@ struct security_hook_list { struct hlist_node list; struct hlist_head *head; union security_list_options hook; - const char *lsm; + const struct lsm_id *lsmid; } __randomize_layout; /* @@ -104,7 +116,7 @@ extern struct security_hook_heads security_hook_heads; extern char *lsm_names; extern void security_add_hooks(struct security_hook_list *hooks, int count, - const char *lsm); + const struct lsm_id *lsmid); #define LSM_FLAG_LEGACY_MAJOR BIT(0) #define LSM_FLAG_EXCLUSIVE BIT(1) -- cgit v1.2.3 From 9285c5ad9d00abfe0f4e2ce4039c8127e7a09738 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:47 -0700 Subject: LSM: Maintain a table of LSM attribute data As LSMs are registered add their lsm_id pointers to a table. This will be used later for attribute reporting. Determine the number of possible security modules based on their respective CONFIG options. 
This allows the number to be known at build time. This allows data structures and tables to use the constant. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/linux/security.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881c..50c178019a58b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -138,6 +138,8 @@ enum lockdown_reason { }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; +extern u32 lsm_active_cnt; +extern const struct lsm_id *lsm_idlist[]; /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, -- cgit v1.2.3 From 267c068e5f8b81b68cc4247c94dbba90a21a634e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:48 -0700 Subject: proc: Use lsmids instead of lsm names for attrs Use the LSM ID number instead of the LSM name to identify which security module's attribute data should be shown in /proc/self/attr. The security_[gs]etprocattr() functions have been changed to expect the LSM ID. The change from a string comparison to an integer comparison in these functions will provide a minor performance improvement. 
Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: Mickael Salaun Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/linux/security.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 50c178019a58b..c81bca77f4f21 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -472,10 +472,9 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); -int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, +int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); -int security_setprocattr(const char *lsm, const char *name, void *value, - size_t size); +int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_netlink_send(struct sock *sk, struct sk_buff *skb); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen); @@ -1339,14 +1338,14 @@ static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) { } -static inline int security_getprocattr(struct task_struct *p, const char *lsm, +static inline int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { return -EINVAL; } -static inline int security_setprocattr(const char *lsm, char *name, - void *value, size_t size) +static inline int security_setprocattr(int lsmid, char *name, void *value, + size_t size) { return -EINVAL; } -- cgit v1.2.3 From a04a1198088a1378d0389c250cc684f649bcc91e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:49 -0700 Subject: LSM: syscalls for current process attributes 
Create a system call lsm_get_self_attr() to provide the security module maintained attributes of the current process. Create a system call lsm_set_self_attr() to set a security module maintained attribute of the current process. Historically these attributes have been exposed to user space via entries in procfs under /proc/self/attr. The attribute value is provided in a lsm_ctx structure. The structure identifies the size of the attribute, and the attribute value. The format of the attribute value is defined by the security module. A flags field is included for LSM specific information. It is currently unused and must be 0. The total size of the data, including the lsm_ctx structure and any padding, is maintained as well. struct lsm_ctx { __u64 id; __u64 flags; __u64 len; __u64 ctx_len; __u8 ctx[]; }; Two new LSM hooks are used to interface with the LSMs. security_getselfattr() collects the lsm_ctx values from the LSMs that support the hook, accounting for space requirements. security_setselfattr() identifies which LSM the attribute is intended for and passes it along. 
Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 4 ++++ include/linux/lsm_hooks.h | 1 + include/linux/security.h | 19 +++++++++++++++++++ include/linux/syscalls.h | 5 +++++ 4 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce5521..c925a0d26edfe 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -262,6 +262,10 @@ LSM_HOOK(int, 0, sem_semop, struct kern_ipc_perm *perm, struct sembuf *sops, LSM_HOOK(int, 0, netlink_send, struct sock *sk, struct sk_buff *skb) LSM_HOOK(void, LSM_RET_VOID, d_instantiate, struct dentry *dentry, struct inode *inode) +LSM_HOOK(int, -EOPNOTSUPP, getselfattr, unsigned int attr, + struct lsm_ctx __user *ctx, size_t *size, u32 flags) +LSM_HOOK(int, -EOPNOTSUPP, setselfattr, unsigned int attr, + struct lsm_ctx *ctx, size_t size, u32 flags) LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, const char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7f0adb33caaae..a2ade0ffe9e7d 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -25,6 +25,7 @@ #ifndef __LINUX_LSM_HOOKS_H #define __LINUX_LSM_HOOKS_H +#include #include #include #include diff --git a/include/linux/security.h b/include/linux/security.h index c81bca77f4f21..dd1fe487385db 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -60,6 +60,7 @@ struct fs_parameter; enum fs_value_type; struct watch; struct watch_notification; +struct lsm_ctx; /* Default (no) options for the capable function */ #define CAP_OPT_NONE 0x0 @@ -472,6 +473,10 @@ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd); int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, 
unsigned nsops, int alter); void security_d_instantiate(struct dentry *dentry, struct inode *inode); +int security_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags); +int security_setselfattr(unsigned int attr, struct lsm_ctx __user *ctx, + size_t size, u32 flags); int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value); int security_setprocattr(int lsmid, const char *name, void *value, size_t size); @@ -1338,6 +1343,20 @@ static inline void security_d_instantiate(struct dentry *dentry, struct inode *inode) { } +static inline int security_getselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t __user *size, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int security_setselfattr(unsigned int attr, + struct lsm_ctx __user *ctx, + size_t size, u32 flags) +{ + return -EOPNOTSUPP; +} + static inline int security_getprocattr(struct task_struct *p, int lsmid, const char *name, char **value) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e929..4e1e56a24f1e7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -71,6 +71,7 @@ struct clone_args; struct open_how; struct mount_attr; struct landlock_ruleset_attr; +struct lsm_ctx; enum landlock_rule_type; struct cachestat_range; struct cachestat; @@ -949,6 +950,10 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t *size, __u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, + size_t size, __u32 flags); /* * Architecture-specific system calls -- cgit v1.2.3 From ad4aff9ec25f400608283c10d634cc4eeda83a02 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 
Sep 2023 13:56:50 -0700 Subject: LSM: Create lsm_list_modules system call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a system call to report the list of Linux Security Modules that are active on the system. The list is provided as an array of LSM ID numbers. The calling application can use this list determine what LSM specific actions it might take. That might include choosing an output format, determining required privilege or bypassing security module specific behavior. Signed-off-by: Casey Schaufler Reviewed-by: Kees Cook Reviewed-by: Serge Hallyn Reviewed-by: John Johansen Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- include/linux/syscalls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4e1e56a24f1e7..feec5719750be 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -954,6 +954,7 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, size_t *size, __u32 flags); asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, size_t size, __u32 flags); +asmlinkage long sys_lsm_list_modules(u64 *ids, size_t *size, u32 flags); /* * Architecture-specific system calls -- cgit v1.2.3 From e1ca7129db2c3b3c4d261702905a752e6b2710b4 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 12 Sep 2023 13:56:52 -0700 Subject: LSM: Helpers for attribute names and filling lsm_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add lsm_name_to_attr(), which translates a text string to a LSM_ATTR value if one is available. Add lsm_fill_user_ctx(), which fills a struct lsm_ctx, including the trailing attribute value. Both are used in module specific components of LSM system calls. 
Signed-off-by: Casey Schaufler Reviewed-by: John Johansen Reviewed-by: Serge Hallyn Reviewed-by: Mickaël Salaün Signed-off-by: Paul Moore --- include/linux/security.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index dd1fe487385db..334f75aa72899 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -264,6 +265,7 @@ int unregister_blocking_lsm_notifier(struct notifier_block *nb); /* prototypes */ extern int security_init(void); extern int early_security_init(void); +extern u64 lsm_name_to_attr(const char *name); /* Security operations */ int security_binder_set_context_mgr(const struct cred *mgr); @@ -490,6 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); +int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, + size_t context_size, u64 id, u64 flags); #else /* CONFIG_SECURITY */ static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) @@ -507,6 +511,11 @@ static inline int unregister_blocking_lsm_notifier(struct notifier_block *nb) return 0; } +static inline u64 lsm_name_to_attr(const char *name) +{ + return LSM_ATTR_UNDEF; +} + static inline void security_free_mnt_opts(void **mnt_opts) { } @@ -1415,6 +1424,11 @@ static inline int security_locked_down(enum lockdown_reason what) { return 0; } +static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, + size_t context_size, u64 id, u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_SECURITY */ #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) -- cgit v1.2.3 From d7cf3412a9f6c547e5ee443fa7644e08898aa3e2 Mon 
Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 24 Oct 2023 14:44:00 -0400 Subject: lsm: consolidate buffer size handling into lsm_fill_user_ctx() While we have a lsm_fill_user_ctx() helper function designed to make life easier for LSMs which return lsm_ctx structs to userspace, we didn't include all of the buffer length safety checks and buffer padding adjustments in the helper. This led to code duplication across the different LSMs and the possibility for mistakes across the different LSM subsystems. In order to reduce code duplication and decrease the chances of silly mistakes, we're consolidating all of this code into the lsm_fill_user_ctx() helper. The buffer padding is also modified from a fixed 8-byte alignment to an alignment that matches the word length of the machine (BITS_PER_LONG / 8). Signed-off-by: Paul Moore --- include/linux/security.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 334f75aa72899..750130a7b9dd2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -492,8 +492,8 @@ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen); int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen); int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); -int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, - size_t context_size, u64 id, u64 flags); +int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, size_t *uctx_len, + void *val, size_t val_len, u64 id, u64 flags); #else /* CONFIG_SECURITY */ static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) @@ -1424,8 +1424,9 @@ static inline int security_locked_down(enum lockdown_reason what) { return 0; } -static inline int lsm_fill_user_ctx(struct lsm_ctx __user *ctx, void *context, - size_t context_size, u64 id, u64 flags) +static 
inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, + size_t *uctx_len, void *val, size_t val_len, + u64 id, u64 flags) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 8569992d64b8f750e34b7858eac5d7daaf0f80fd Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:45 -0700 Subject: KVM: Use gfn instead of hva for mmu_notifier_retry Currently in mmu_notifier invalidate path, hva range is recorded and then checked against by mmu_invalidate_retry_hva() in the page fault handling path. However, for the soon-to-be-introduced private memory, a page fault may not have a hva associated, checking gfn(gpa) makes more sense. For existing hva based shared memory, gfn is expected to also work. The only downside is when aliasing multiple gfns to a single hva, the current algorithm of checking multiple ranges could result in a much larger range being rejected. Such aliasing should be uncommon, so the impact is expected small. Suggested-by: Sean Christopherson Cc: Xu Yilun Signed-off-by: Chao Peng Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba [sean: convert vmx_set_apic_access_page_addr() to gfn-based API] Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Xu Yilun Message-Id: <20231027182217.3615211-4-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fb6c6109fdcad..11d0916883460 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -787,8 +787,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; - unsigned long mmu_invalidate_range_start; - unsigned long mmu_invalidate_range_end; + gfn_t mmu_invalidate_range_start; + gfn_t mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; @@ -1392,10 
+1392,9 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, - unsigned long end); -void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, - unsigned long end); +void kvm_mmu_invalidate_begin(struct kvm *kvm); +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_mmu_invalidate_end(struct kvm *kvm); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -1970,9 +1969,9 @@ static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) return 0; } -static inline int mmu_invalidate_retry_hva(struct kvm *kvm, +static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, unsigned long mmu_seq, - unsigned long hva) + gfn_t gfn) { lockdep_assert_held(&kvm->mmu_lock); /* @@ -1981,10 +1980,20 @@ static inline int mmu_invalidate_retry_hva(struct kvm *kvm, * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handing concurrent invalidations. */ - if (unlikely(kvm->mmu_invalidate_in_progress) && - hva >= kvm->mmu_invalidate_range_start && - hva < kvm->mmu_invalidate_range_end) - return 1; + if (unlikely(kvm->mmu_invalidate_in_progress)) { + /* + * Dropping mmu_lock after bumping mmu_invalidate_in_progress + * but before updating the range is a KVM bug. 
+ */ + if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA || + kvm->mmu_invalidate_range_end == INVALID_GPA)) + return 1; + + if (gfn >= kvm->mmu_invalidate_range_start && + gfn < kvm->mmu_invalidate_range_end) + return 1; + } + if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; -- cgit v1.2.3 From f128cf8cfbecccf95e891ae90d9c917df5117c7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:49 -0700 Subject: KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to CONFIG_KVM_GENERIC_MMU_NOTIFIER Convert KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig and select it where appropriate to effectively maintain existing behavior. Using a proper Kconfig will simplify building more functionality on top of KVM's mmu_notifier infrastructure. Add a forward declaration of kvm_gfn_range to kvm_types.h so that including arch/powerpc/include/asm/kvm_ppc.h's with CONFIG_KVM=n doesn't generate warnings due to kvm_gfn_range being undeclared. PPC defines hooks for PR vs. HV without guarding them via #ifdeffery, e.g. bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); Alternatively, PPC could forward declare kvm_gfn_range, but there's no good reason not to define it in common KVM. 
Acked-by: Anup Patel Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 6 +++--- include/linux/kvm_types.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 11d0916883460..5faba69403ac4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,7 +253,7 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { pte_t pte; }; @@ -783,7 +783,7 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; @@ -1946,7 +1946,7 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 6f4737d5046a4..9d1f7835d8c13 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -6,6 +6,7 @@ struct kvm; struct kvm_async_pf; struct kvm_device_ops; +struct kvm_gfn_range; struct kvm_interrupt; struct kvm_irq_routing_table; struct kvm_memory_slot; -- cgit v1.2.3 From bb58b90b1a8f753b582055adaf448214a8e22c31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 
Oct 2023 11:21:50 -0700 Subject: KVM: Introduce KVM_SET_USER_MEMORY_REGION2 Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-9-seanjc@google.com> Acked-by: Kai Huang Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5faba69403ac4..4e741ff27af36 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1146,9 +1146,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, -- cgit v1.2.3 From 16f95f3b95caded251a0440051e44a2fbe9e5f55 Mon Sep 17 00:00:00 2001 From: 
Chao Peng Date: Fri, 27 Oct 2023 11:21:51 -0700 Subject: KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace Add a new KVM exit type to allow userspace to handle memory faults that KVM cannot resolve, but that userspace *may* be able to handle (without terminating the guest). KVM will initially use KVM_EXIT_MEMORY_FAULT to report implicit conversions between private and shared memory. With guest private memory, there will be two kinds of memory conversions: - explicit conversion: happens when the guest explicitly calls into KVM to map a range (as private or shared) - implicit conversion: happens when the guest attempts to access a gfn that is configured in the "wrong" state (private vs. shared) On x86 (first architecture to support guest private memory), explicit conversions will be reported via KVM_EXIT_HYPERCALL+KVM_HC_MAP_GPA_RANGE, but reporting KVM_EXIT_HYPERCALL for implicit conversions is undesirable as there is (obviously) no hypercall, and there is no guarantee that the guest actually intends to convert between private and shared, i.e. what KVM thinks is an implicit conversion "request" could actually be the result of a guest code bug. KVM_EXIT_MEMORY_FAULT will be used to report memory faults that appear to be implicit conversions. Note! To allow for future possibilities where KVM reports KVM_EXIT_MEMORY_FAULT and fills run->memory_fault on _any_ unresolved fault, KVM returns "-EFAULT" (-1 with errno == EFAULT from userspace's perspective), not '0'! Due to historical baggage within KVM, exiting to userspace with '0' from deep callstacks, e.g. in emulation paths, is infeasible as doing so would require a near-complete overhaul of KVM, whereas KVM already propagates -errno return codes to userspace even when the -errno originated in a low level helper. Report the gpa+size instead of a single gfn even though the initial usage is expected to always report single pages.
It's entirely possible, likely even, that KVM will someday support sub-page granularity faults, e.g. Intel's sub-page protection feature allows for additional protections at 128-byte granularity. Link: https://lore.kernel.org/all/20230908222905.1321305-5-amoorthy@google.com Link: https://lore.kernel.org/all/ZQ3AmLO2SYv3DszH@google.com Cc: Anish Moorthy Cc: David Matlack Suggested-by: Sean Christopherson Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-10-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e741ff27af36..96aa930536b16 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2327,4 +2327,15 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 +static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t size) +{ + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + vcpu->run->memory_fault.gpa = gpa; + vcpu->run->memory_fault.size = size; + + /* Flags are not (yet) defined or communicated to userspace. */ + vcpu->run->memory_fault.flags = 0; +} + #endif -- cgit v1.2.3 From 5a475554db1e476a14216e742ea2bdb77362d5d5 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:55 -0700 Subject: KVM: Introduce per-page memory attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In confidential computing usages, whether a page is private or shared is necessary information for KVM to perform operations like page fault handling, page zapping etc. 
There are other potential use cases for per-page memory attributes, e.g. to make memory read-only (or no-exec, or exec-only, etc.) without having to modify memslots. Introduce the KVM_SET_MEMORY_ATTRIBUTES ioctl, advertised by KVM_CAP_MEMORY_ATTRIBUTES, to allow userspace to set the per-page memory attributes to a guest memory range. Use an xarray to store the per-page attributes internally, with a naive, not fully optimized implementation, i.e. prioritize correctness over performance for the initial implementation. Use bit 3 for the PRIVATE attribute so that KVM can use bits 0-2 for RWX attributes/protections in the future, e.g. to give userspace fine-grained control over read, write, and execute protections for guest memory. Provide arch hooks for handling attribute changes before and after common code sets the new attributes, e.g. x86 will use the "pre" hook to zap all relevant mappings, and the "post" hook to track whether or not hugepages can be used to map the range. To simplify the implementation wrap the entire sequence with kvm_mmu_invalidate_{begin,end}() even though the operation isn't strictly guaranteed to be an invalidation. For the initial use case, x86 *will* always invalidate memory, and preventing arch code from creating new mappings while the attributes are in flux makes it much easier to reason about the correctness of consuming attributes. It's possible that future usages may not require an invalidation, e.g. if KVM ends up supporting RWX protections and userspace grants _more_ protections, but again opt for simplicity and punt optimizations to if/when they are needed. 
Suggested-by: Sean Christopherson Link: https://lore.kernel.org/all/Y2WB48kD0J4VGynX@google.com Cc: Fuad Tabba Cc: Xu Yilun Cc: Mickaël Salaün Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96aa930536b16..68a144cb7dbc6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -256,6 +256,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { pte_t pte; + unsigned long attributes; }; struct kvm_gfn_range { @@ -806,6 +807,10 @@ struct kvm { #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; +#endif +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + /* Protected by slots_locks (for writes) and RCU (for reads) */ + struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; }; @@ -2338,4 +2343,18 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, vcpu->run->memory_fault.flags = 0; } +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) +{ + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); +} + +bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attrs); +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range); +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range); +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ + #endif -- cgit v1.2.3 From 0003e2a414687fff6a75250d381e4abf345d663f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:56 -0700 Subject: mm: Add AS_UNMOVABLE to mark mapping as completely 
unmovable Add an "unmovable" flag for mappings that cannot be migrated under any circumstance. KVM will use the flag for its upcoming GUEST_MEMFD support, which will not support compaction/migration, at least not in the foreseeable future. Test AS_UNMOVABLE under folio lock as already done for the async compaction/dirty folio case, as the mapping can be removed by truncation while compaction is running. To avoid having to lock every folio with a mapping, assume/require that unmovable mappings are also unevictable, and have mapping_set_unmovable() also set AS_UNEVICTABLE. Cc: Matthew Wilcox Co-developed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/pagemap.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 351c3b7f93a14..82c9bf506b79c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -203,7 +203,8 @@ enum mapping_flags { /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, - AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ + AS_RELEASE_ALWAYS = 7, /* Call ->release_folio(), even if no private data */ + AS_UNMOVABLE = 8, /* The mapping cannot be moved, ever */ }; /** @@ -289,6 +290,22 @@ static inline void mapping_clear_release_always(struct address_space *mapping) clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } +static inline void mapping_set_unmovable(struct address_space *mapping) +{ + /* + * It's expected unmovable mappings are also unevictable. Compaction + * migrate scanner (isolate_migratepages_block()) relies on this to + * reduce page locking. 
+ */ + set_bit(AS_UNEVICTABLE, &mapping->flags); + set_bit(AS_UNMOVABLE, &mapping->flags); +} + +static inline bool mapping_unmovable(struct address_space *mapping) +{ + return test_bit(AS_UNMOVABLE, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From 3fad96e9b21bed214c1593d7d7fb3e40d1fbf6f4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 24 Oct 2023 11:57:15 +0100 Subject: firmware: arm_ffa: Declare ffa_bus_type structure in the header smatch reports: drivers/firmware/arm_ffa/bus.c:108:17: warning: symbol 'ffa_bus_type' was not declared. Should it be static? ffa_bus_type is exported to be useful in the FF-A driver. So this warning is not correct. However, declaring the ffa_bus_type structure in the header like many other bus_types do already removes this warning. So let us just do the same and get rid of the warning. Link: https://lore.kernel.org/r/20231024105715.2369638-1-sudeep.holla@arm.com Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 1abedb5b2e48f..3d0fde57ba90e 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -209,6 +209,8 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) { return false; } #define module_ffa_driver(__ffa_driver) \ module_driver(__ffa_driver, ffa_register, ffa_unregister) +extern struct bus_type ffa_bus_type; + /* FFA transport related */ struct ffa_partition_info { u16 id; -- cgit v1.2.3 From 4f0b9194bc119a9850a99e5e824808e2f468c348 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Nov 2023 06:47:51 -0400 Subject: fs: Rename anon_inode_getfile_secure() and anon_inode_getfd_secure() The call to the inode_init_security_anon() LSM hook is not the sole reason to use anon_inode_getfile_secure() or anon_inode_getfd_secure(). 
For example, the functions also allow one to create a file with non-zero size, without needing a full-blown filesystem. In this case, you don't need a "secure" version, just unique inodes; the current name of the functions is confusing and does not explain well the difference with the more "standard" anon_inode_getfile() and anon_inode_getfd(). Of course, there is another side of the coin; neither io_uring nor userfaultfd strictly speaking need distinct inodes, and it is not that clear anymore that anon_inode_create_get{file,fd}() allow the LSM to intercept and block the inode's creation. If one was so inclined, anon_inode_getfile_secure() and anon_inode_getfd_secure() could be kept, using the shared inode or a new one depending on CONFIG_SECURITY. However, this is probably overkill, and potentially a cause of bugs in different configurations. Therefore, just add a comment to io_uring and userfaultfd explaining the choice of the function. While at it, remove the export for what is now anon_inode_create_getfd(). There is no in-tree module that uses it, and the old name is gone anyway. If anybody actually needs the symbol, they can ask or they can just use anon_inode_create_getfile(), which will be exported very soon for use in KVM. 
Suggested-by: Christian Brauner Reviewed-by: Christian Brauner Signed-off-by: Paolo Bonzini --- include/linux/anon_inodes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 5deaddbd79278..93a5f16d03f3f 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -15,13 +15,13 @@ struct inode; struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags); -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode); int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags); -int anon_inode_getfd_secure(const char *name, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode); -- cgit v1.2.3 From a7800aa80ea4d5356b8474c2302812e9d4926fa6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Nov 2023 05:42:34 -0500 Subject: KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce an ioctl(), KVM_CREATE_GUEST_MEMFD, to allow creating file-based memory that is tied to a specific KVM virtual machine and whose primary purpose is to serve guest memory. A guest-first memory subsystem allows for optimizations and enhancements that are kludgy or outright infeasible to implement/support in a generic memory subsystem. With guest_memfd, guest protections and mapping sizes are fully decoupled from host userspace mappings. E.g. 
KVM currently doesn't support mapping memory as writable in the guest without it also being writable in host userspace, as KVM's ABI uses VMA protections to define the allowed guest protection. Userspace can fudge this by establishing two mappings, a writable mapping for the guest and a readable one for itself, but that’s suboptimal on multiple fronts. Similarly, KVM currently requires the guest mapping size to be a strict subset of the host userspace mapping size, e.g. KVM doesn’t support creating a 1GiB guest mapping unless userspace also has a 1GiB guest mapping. Decoupling the mapping sizes would allow userspace to precisely map only what is needed without impacting guest performance, e.g. to harden against unintentional accesses to guest memory. Decoupling guest and userspace mappings may also allow for a cleaner alternative to high-granularity mappings for HugeTLB, which has reached a bit of an impasse and is unlikely to ever be merged. A guest-first memory subsystem also provides clearer line of sight to things like a dedicated memory pool (for slice-of-hardware VMs) and elimination of "struct page" (for offload setups where userspace _never_ needs to mmap() guest memory). More immediately, being able to map memory into KVM guests without mapping said memory into the host is critical for Confidential VMs (CoCo VMs), the initial use case for guest_memfd. While AMD's SEV and Intel's TDX prevent untrusted software from reading guest private data by encrypting guest memory with a key that isn't usable by the untrusted host, projects such as Protected KVM (pKVM) provide confidentiality and integrity *without* relying on memory encryption. And with SEV-SNP and TDX, accessing guest private memory can be fatal to the host, i.e. KVM must prevent host userspace from accessing guest memory irrespective of hardware behavior. Attempt #1 to support CoCo VMs was to add a VMA flag to mark memory as being mappable only by KVM (or a similarly enlightened kernel subsystem).
That approach was abandoned largely due to it needing to play games with PROT_NONE to prevent userspace from accessing guest memory. Attempt #2 was to usurp PG_hwpoison to prevent the host from mapping guest private memory into userspace, but that approach failed to meet several requirements for software-based CoCo VMs, e.g. pKVM, as the kernel wouldn't easily be able to enforce a 1:1 page:guest association, let alone a 1:1 pfn:gfn mapping. And using PG_hwpoison does not work for memory that isn't backed by 'struct page', e.g. if devices gain support for exposing encrypted memory regions to guests. Attempt #3 was to extend the memfd() syscall and wrap shmem to provide dedicated file-based guest memory. That approach made it as far as v10 before feedback from Hugh Dickins and Christian Brauner (and others) led to its demise. Hugh's objection was that piggybacking shmem made no sense for KVM's use case as KVM didn't actually *want* the features provided by shmem. I.e. KVM was using memfd() and shmem to avoid having to manage memory directly, not because memfd() and shmem were the optimal solution, e.g. things like read/write/mmap in shmem were dead weight. Christian pointed out flaws with implementing a partial overlay (wrapping only _some_ of shmem), e.g. poking at inode_operations or super_operations would show shmem stuff, but address_space_operations and file_operations would show KVM's overlay. Paraphrasing heavily, Christian suggested KVM stop being lazy and create a proper API. 
Link: https://lore.kernel.org/all/20201020061859.18385-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210416154106.23721-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210824005248.200037-1-seanjc@google.com Link: https://lore.kernel.org/all/20211111141352.26311-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/ff5c5b97-acdf-9745-ebe5-c6609dd6322e@google.com Link: https://lore.kernel.org/all/20230418-anfallen-irdisch-6993a61be10b@brauner Link: https://lore.kernel.org/all/ZEM5Zq8oo+xnApW9@google.com Link: https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey Link: https://lore.kernel.org/linux-mm/ZII1p8ZHlHaQ3dDl@casper.infradead.org Cc: Fuad Tabba Cc: Vishal Annapurve Cc: Ackerley Tng Cc: Jarkko Sakkinen Cc: Maciej Szmigiero Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Quentin Perret Cc: Michael Roth Cc: Wang Cc: Liam Merwick Cc: Isaku Yamahata Co-developed-by: Kirill A. Shutemov Signed-off-by: Kirill A. 
Shutemov Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Co-developed-by: Chao Peng Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Isaku Yamahata Signed-off-by: Isaku Yamahata Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-17-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 68a144cb7dbc6..a6de526c04267 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -589,8 +589,20 @@ struct kvm_memory_slot { u32 flags; short id; u16 as_id; + +#ifdef CONFIG_KVM_PRIVATE_MEM + struct { + struct file __rcu *file; + pgoff_t pgoff; + } gmem; +#endif }; +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +{ + return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); +} + static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -685,6 +697,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif +/* + * Arch code must define kvm_arch_has_private_mem if support for private memory + * is enabled. 
+ */ +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) +static inline bool kvm_arch_has_private_mem(struct kvm *kvm) +{ + return false; +} +#endif + struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; @@ -1400,6 +1423,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void kvm_mmu_invalidate_begin(struct kvm *kvm); void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_mmu_invalidate_end(struct kvm *kvm); +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -2355,6 +2379,30 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); + +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && + kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; +} +#else +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return false; +} #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ +#ifdef CONFIG_KVM_PRIVATE_MEM +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order); +#else +static inline int kvm_gmem_get_pfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t *pfn, int *max_order) +{ + KVM_BUG_ON(1, kvm); + return -EIO; +} +#endif /* CONFIG_KVM_PRIVATE_MEM */ + #endif -- cgit v1.2.3 From 8dd2eee9d526c30fccfe75da7ec5365c6476e510 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:02 -0700 Subject: KVM: x86/mmu: Handle page fault for private memory Add support for resolving page faults on guest private memory for VMs that differentiate between "shared" and "private" memory. 
For such VMs, KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and hva-based shared memory, and KVM needs to map in the "correct" variant, i.e. KVM needs to map the gfn shared/private as appropriate based on the current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag. For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request shared vs. private via a bit in the guest page tables, i.e. what the guest wants may conflict with the current memory attributes. To support such "implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT to forward the request to userspace. Add a new flag for memory faults, KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to map memory as shared vs. private. Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to exit on missing mappings when handling guest page fault VM-Exits. In that case, userspace will want to know RWX information in order to correctly/precisely resolve the fault. Note, private memory *must* be backed by guest_memfd, i.e. shared mappings always come from the host userspace page tables, and private mappings always come from a guest_memfd instance. 
Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a6de526c04267..67dfd4d79529f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2357,14 +2357,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) #define KVM_DIRTY_RING_MAX_ENTRIES 65536 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - gpa_t gpa, gpa_t size) + gpa_t gpa, gpa_t size, + bool is_write, bool is_exec, + bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; vcpu->run->memory_fault.size = size; - /* Flags are not (yet) defined or communicated to userspace. */ + /* RWX flags are not (yet) defined or communicated to userspace. */ vcpu->run->memory_fault.flags = 0; + if (is_private) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES -- cgit v1.2.3 From 2333afa17af0f4b6651214ee17cfd5ae5f47787a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:03 -0700 Subject: KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro Drop __KVM_VCPU_MULTIPLE_ADDRESS_SPACE and instead check the value of KVM_ADDRESS_SPACE_NUM. No functional change intended. 
Reviewed-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 67dfd4d79529f..db423ea9e3a44 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -690,7 +690,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#if KVM_ADDRESS_SPACE_NUM == 1 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; -- cgit v1.2.3 From eed52e434bc33603ddb0af62b6c4ef818948489d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:04 -0700 Subject: KVM: Allow arch code to track number of memslot address spaces per VM Let x86 track the number of address spaces on a per-VM basis so that KVM can disallow SMM memslots for confidential VMs. Confidential VMs are fundamentally incompatible with emulating SMM, which as the name suggests requires being able to read and write guest memory and register state. Disallowing SMM will simplify support for guest private memory, as KVM will not need to worry about tracking memory attributes for multiple address spaces (SMM is the only "non-default" address space across all architectures). 
Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index db423ea9e3a44..3ebc6912c54a1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -80,8 +80,8 @@ /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 -#ifndef KVM_ADDRESS_SPACE_NUM -#define KVM_ADDRESS_SPACE_NUM 1 +#ifndef KVM_MAX_NR_ADDRESS_SPACES +#define KVM_MAX_NR_ADDRESS_SPACES 1 #endif /* @@ -690,7 +690,12 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#if KVM_ADDRESS_SPACE_NUM == 1 +#if KVM_MAX_NR_ADDRESS_SPACES == 1 +static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) +{ + return KVM_MAX_NR_ADDRESS_SPACES; +} + static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; @@ -745,9 +750,9 @@ struct kvm { struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; /* The two memslot sets - active and inactive (per address space) */ - struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2]; + struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2]; /* The current active memslot set for each address space */ - struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; + struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES]; struct xarray vcpu_array; /* * Protected by slots_lock, but can be read outside if an @@ -1017,7 +1022,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { - as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); + as_id = array_index_nospec(as_id, 
KVM_MAX_NR_ADDRESS_SPACES); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); -- cgit v1.2.3 From 84db47ca7146d7bd00eb5cf2b93989a971c84650 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Fri, 20 Oct 2023 21:27:46 +0530 Subject: sched/numa: Fix mm numa_scan_seq based unconditional scan Since commit fc137c0ddab2 ("sched/numa: enhance vma scanning logic") NUMA Balancing allows updating PTEs to trap NUMA hinting faults if the task had previously accessed VMA. However, unconditional scans of VMAs are allowed during the initial phase of VMA creation until the process's mm numa_scan_seq reaches 2, even though the current task has not accessed the VMA. Rationale: - Without initial scan subsequent PTE update may never happen. - Give fair opportunity to all the VMAs to be scanned and subsequently understand the access pattern of all the VMAs. But it has a corner case where, if a VMA is created after some time, process's mm numa_scan_seq could be already greater than 2. E.g., values of mm numa_scan_seq when VMAs are created by running mmtest autonuma benchmark briefly looks like: start_seq=0 : 459 start_seq=2 : 138 start_seq=3 : 144 start_seq=4 : 8 start_seq=8 : 1 start_seq=9 : 1 This results in no unconditional PTE updates for those VMAs created after some time. Fix: - Note down the initial value of mm numa_scan_seq in per VMA start_seq. - Allow unconditional scan till start_seq + 2. Result: SUT: AMD EPYC Milan with 2 NUMA nodes 256 cpus. base kernel: upstream 6.6-rc6 with Mels patches [1] applied. 
kernbench ========== base patched %gain Amean elsp-128 165.09 ( 0.00%) 164.78 * 0.19%* Duration User 41404.28 41375.08 Duration System 9862.22 9768.48 Duration Elapsed 519.87 518.72 Ops NUMA PTE updates 1041416.00 831536.00 Ops NUMA hint faults 263296.00 220966.00 Ops NUMA pages migrated 258021.00 212769.00 Ops AutoNUMA cost 1328.67 1114.69 autonumabench NUMA01_THREADLOCAL ================== Amean elsp-NUMA01_THREADLOCAL 81.79 (0.00%) 67.74 * 17.18%* Duration User 54832.73 47379.67 Duration System 75.00 185.75 Duration Elapsed 576.72 476.09 Ops NUMA PTE updates 394429.00 11121044.00 Ops NUMA hint faults 1001.00 8906404.00 Ops NUMA pages migrated 288.00 2998694.00 Ops AutoNUMA cost 7.77 44666.84 Signed-off-by: Raghavendra K T Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/r/2ea7cbce80ac7c62e90cbfb9653a7972f902439f.1697816692.git.raghavendra.kt@amd.com --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2a..950df415d7de9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -600,6 +600,9 @@ struct vma_numab_state { */ unsigned long pids_active[2]; + /* MM scan sequence ID when scan first started after VMA creation */ + int start_scan_seq; + /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq -- cgit v1.2.3 From 2227a957e1d5b1941be4e4207879ec74f4bb37f8 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 15 Nov 2023 11:36:45 +0800 Subject: sched/eevdf: Sort the rbtree by virtual deadline Sort the task timeline by virtual deadline and keep the min_vruntime in the augmented tree, so we can avoid doubling the worst case cost and make full use of the cached leftmost node to enable O(1) fastpath picking in next patch. 
Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c316972485..cd56d40185273 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -553,7 +553,7 @@ struct sched_entity { struct load_weight load; struct rb_node run_node; u64 deadline; - u64 min_deadline; + u64 min_vruntime; struct list_head group_node; unsigned int on_rq; -- cgit v1.2.3 From 5d69eca542ee17c618f9a55da52191d5e28b435f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:18 +0100 Subject: sched: Unify runtime accounting across classes All classes use sched_entity::exec_start to track runtime and have copies of the exact same code around to compute runtime. Collapse all that. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Valentin Schneider Reviewed-by: Steven Rostedt (Google) Link: https://lkml.kernel.org/r/54d148a144f26d9559698c4dd82d8859038a7380.1699095159.git.bristot@kernel.org --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index cd56d40185273..44b46d9743bfa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,7 +523,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; - u64 exec_max; + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; -- cgit v1.2.3 From 63ba8422f876e32ee564ea95da9a7313b13ff0a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 4 Nov 2023 11:59:21 +0100 Subject: sched/deadline: Introduce deadline servers Low priority tasks (e.g., SCHED_OTHER) can suffer starvation if tasks with higher priority (e.g., SCHED_FIFO) monopolize 
CPU(s). RT Throttling has been introduced a while ago as a (mostly debug) countermeasure one can utilize to reserve some CPU time for low priority tasks (usually background type of work, e.g. workqueues, timers, etc.). It however has its own problems (see documentation) and the undesired effect of unconditionally throttling FIFO tasks even when no lower priority activity needs to run (there are mechanisms to fix this issue as well, but, again, with their own problems). Introduce deadline servers to service low priority tasks needs under starvation conditions. Deadline servers are built extending SCHED_DEADLINE implementation to allow 2-level scheduling (a sched_deadline entity becomes a container for lower priority scheduling entities). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/4968601859d920335cf85822eb573a5f179f04b8.1699095159.git.bristot@kernel.org --- include/linux/sched.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 44b46d9743bfa..8d258162deb0a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,11 +63,13 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; struct user_event_mm; /* @@ -607,6 +609,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); + struct sched_dl_entity { struct rb_node rb_node; @@ -654,6 +659,7 @@ struct sched_dl_entity { unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; /* * Bandwidth enforcement 
timer. Each -deadline task has its @@ -668,7 +674,20 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + * + * @server_has_tasks() returns true if @server_pick return a + * runnable task. + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; + dl_server_pick_f server_pick; #ifdef CONFIG_RT_MUTEXES /* @@ -795,6 +814,7 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE -- cgit v1.2.3 From e4ab322fbaaaf84b23d6cb0e3317a7f68baf36dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Sep 2023 13:22:17 +0200 Subject: cleanup: Add conditional guard support Adds: - DEFINE_GUARD_COND() / DEFINE_LOCK_GUARD_1_COND() to extend existing guards with conditional lock primitives, eg. mutex_trylock(), mutex_lock_interruptible(). nb. both primitives allow NULL 'locks', which cause the lock to fail (obviously). - extends scoped_guard() to not take the body when the conditional guard 'fails'. eg. scoped_guard (mutex_intr, &task->signal_cred_guard_mutex) { ... } will only execute the body when the mutex is held. - provides scoped_cond_guard(name, fail, args...); which extends scoped_guard() to do fail when the lock-acquire fails. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231102110706.460851167%40infradead.org --- include/linux/cleanup.h | 52 +++++++++++++++++++++++++++++++++++++++++++++--- include/linux/mutex.h | 3 ++- include/linux/rwsem.h | 8 ++++---- include/linux/spinlock.h | 15 ++++++++++++++ 4 files changed, 70 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 9f1a9c455b684..c2d09bc4f9768 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -125,25 +125,55 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * trivial wrapper around DEFINE_CLASS() above specifically * for locks. * + * DEFINE_GUARD_COND(name, ext, condlock) + * wrapper around EXTEND_CLASS above to add conditional lock + * variants to a base class, eg. mutex_trylock() or + * mutex_lock_interruptible(). + * * guard(name): - * an anonymous instance of the (guard) class + * an anonymous instance of the (guard) class, not recommended for + * conditional locks. * * scoped_guard (name, args...) { }: * similar to CLASS(name, scope)(args), except the variable (with the * explicit name 'scope') is declard in a for-loop such that its scope is * bound to the next (compound) statement. * + * for conditional locks the loop body is skipped when the lock is not + * acquired. + * + * scoped_cond_guard (name, fail, args...) { }: + * similar to scoped_guard(), except it does fail when the lock + * acquire fails. 
+ * */ #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, _unlock, ({ _lock; _T; }), _type _T) + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return *_T; } + +#define DEFINE_GUARD_COND(_name, _ext, _condlock) \ + EXTEND_CLASS(_name, _ext, \ + ({ void *_t = _T; if (_T && !(_condlock)) _t = NULL; _t; }), \ + class_##_name##_t _T) \ + static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + { return class_##_name##_lock_ptr(_T); } #define guard(_name) \ CLASS(_name, __UNIQUE_ID(guard)) +#define __guard_ptr(_name) class_##_name##_lock_ptr + #define scoped_guard(_name, args...) \ for (CLASS(_name, scope)(args), \ - *done = NULL; !done; done = (void *)1) + *done = NULL; __guard_ptr(_name)(&scope) && !done; done = (void *)1) + +#define scoped_cond_guard(_name, _fail, args...) \ + for (CLASS(_name, scope)(args), \ + *done = NULL; !done; done = (void *)1) \ + if (!__guard_ptr(_name)(&scope)) _fail; \ + else /* * Additional helper macros for generating lock guards with types, either for @@ -152,6 +182,7 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ * * DEFINE_LOCK_GUARD_0(name, lock, unlock, ...) * DEFINE_LOCK_GUARD_1(name, type, lock, unlock, ...) 
+ * DEFINE_LOCK_GUARD_1_COND(name, ext, condlock) * * will result in the following type: * @@ -173,6 +204,11 @@ typedef struct { \ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (_T->lock) { _unlock; } \ +} \ + \ +static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ +{ \ + return _T->lock; \ } @@ -201,4 +237,14 @@ __DEFINE_LOCK_GUARD_1(_name, _type, _lock) __DEFINE_UNLOCK_GUARD(_name, void, _unlock, __VA_ARGS__) \ __DEFINE_LOCK_GUARD_0(_name, _lock) +#define DEFINE_LOCK_GUARD_1_COND(_name, _ext, _condlock) \ + EXTEND_CLASS(_name, _ext, \ + ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ + if (_T->lock && !(_condlock)) _T->lock = NULL; \ + _t; }), \ + typeof_member(class_##_name##_t, lock) l) \ + static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + { return class_##_name##_lock_ptr(_T); } + + #endif /* __LINUX_GUARDS_H */ diff --git a/include/linux/mutex.h b/include/linux/mutex.h index a33aa9eb9fc3b..95d11308f995d 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -221,6 +221,7 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) -DEFINE_FREE(mutex, struct mutex *, if (_T) mutex_unlock(_T)) +DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) +DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 1dd530ce8b45b..9c29689ff505e 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -203,11 +203,11 @@ extern void up_read(struct rw_semaphore *sem); extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) -DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) - -DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T)) 
-DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T)) +DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T)) +DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0) +DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) +DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T)) /* * downgrade write lock to read lock diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 31d3d747a9db7..ceb56b39c70f7 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -507,6 +507,8 @@ DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t, raw_spin_lock(_T->lock), raw_spin_unlock(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t, raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING), raw_spin_unlock(_T->lock)) @@ -515,23 +517,36 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t, raw_spin_lock_irq(_T->lock), raw_spin_unlock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock)) + DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t, raw_spin_lock_irqsave(_T->lock, _T->flags), raw_spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try, + raw_spin_trylock_irqsave(_T->lock, _T->flags)) + DEFINE_LOCK_GUARD_1(spinlock, spinlock_t, spin_lock(_T->lock), spin_unlock(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t, spin_lock_irq(_T->lock), spin_unlock_irq(_T->lock)) +DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try, + spin_trylock_irq(_T->lock)) + DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, spin_lock_irqsave(_T->lock, _T->flags), spin_unlock_irqrestore(_T->lock, _T->flags), unsigned long flags) +DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, + spin_trylock_irqsave(_T->lock, _T->flags)) + #undef __LINUX_INSIDE_SPINLOCK_H 
#endif /* __LINUX_SPINLOCK_H */ -- cgit v1.2.3 From 4aea6a6d61cd6e3df9ed98345638abad1b1e5276 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 30 Mar 2023 15:05:38 -0700 Subject: net/mlx5: Query maximum frequency adjustment of the PTP hardware clock Some mlx5 devices do not support the default advertised maximum frequency adjustment value for the PTP hardware clock that is set by the driver. These devices need to be queried when initializing the clock functionality in order to get the maximum supported frequency adjustment value. This value can be greater than the minimum supported frequency adjustment across mlx5 devices (50 million ppb). Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f386..ce2e71cd6d2a3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10103,7 +10103,10 @@ enum { struct mlx5_ifc_mtutc_reg_bits { u8 reserved_at_0[0x5]; u8 freq_adj_units[0x3]; - u8 reserved_at_8[0x14]; + u8 reserved_at_8[0x3]; + u8 log_max_freq_adjustment[0x5]; + + u8 reserved_at_10[0xc]; u8 operation[0x4]; u8 freq_adjustment[0x20]; -- cgit v1.2.3 From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:05:57 -0800 Subject: bpf: generalize reg_set_min_max() to handle non-const register comparisons Generalize bounds adjustment logic of reg_set_min_max() to handle not just register vs constant case, but in general any register vs any register cases. For most of the operations it's trivial extension based on range vs range comparison logic, we just need to properly pick min/max of a range to compare against min/max of the other range. 
For BPF_JSET we keep the original capabilities, just make sure JSET is integrated in the common framework. This is manifested in the internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more uniform rev_opcode() handling. See the code for details. This allows reusing exactly the same code for both TRUE and FALSE branches without explicitly handling both conditions with custom code. Note also that now we don't need special handling of the BPF_JEQ/BPF_JNE case when none of the registers are constants. This is now just a normal generic case handled by reg_set_min_max(). To make tnum handling cleaner, tnum_with_subreg() helper is added, as that's a common operator when dealing with 32-bit subregister bounds. This keeps the overall logic much less noisy when it comes to tnums. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 1c3948a1d6ad9..3c13240077b87 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a); struct tnum tnum_subreg(struct tnum a); /* Returns the tnum with the lower 32-bit subreg cleared */ struct tnum tnum_clear_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower + * 32-bit subreg in *subreg* + */ +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg); /* Returns the tnum with the lower 32-bit subreg set to value */ struct tnum tnum_const_subreg(struct tnum a, u32 value); /* Returns true if 32-bit subreg @a is a known constant*/ -- cgit v1.2.3 From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:00 -0800 Subject: bpf: add register bounds sanity checks 
and sanitization Add simple sanity checks that validate well-formed ranges (min <= max) across u64, s64, u32, and s32 ranges. Also for cases when the value is constant (either 64-bit or 32-bit), we validate that ranges and tnums are in agreement. These bounds checks are performed at the end of BPF_ALU/BPF_ALU64 operations, on conditional jumps, and for LDX instructions (where subreg zero/sign extension is probably the most important to check). This covers most of the interesting cases. Also, we validate the sanity of the return register when manually adjusting it for some special helpers. By default, sanity violation will trigger a warning in verifier log and resetting register bounds to "unbounded" ones. But to aid development and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will trigger hard failure of verification with -EFAULT on register bounds violations. This allows selftests to catch such issues. veristat will also gain a CLI option to enable this behavior. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 24213a99cc79d..402b6bc44a1b7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,6 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ + bool test_sanity_strict; /* fail verification on sanity violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; -- cgit v1.2.3 From 
dfcb264a01a9199e8338a548731baf5bbe77ef19 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 4 Nov 2023 16:49:06 +0100 Subject: power: supply: bq27xxx: Stop and start delayed work in suspend and resume This driver uses delayed work to perform periodic battery state read out. This delayed work is not stopped across suspend and resume cycle. The read out can occur early in the resume cycle. In case of an I2C variant of this hardware, that read out triggers I2C transfer. That I2C transfer may happen while the I2C controller is still suspended, which produces a WARNING in the kernel log. Fix this by introducing trivial PM ops, which stop the delayed work before the system enters suspend, and schedule the delayed work right after the system resumes. Signed-off-by: Marek Vasut Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20231104154920.68585-1-marex@denx.de Signed-off-by: Sebastian Reichel --- include/linux/power/bq27xxx_battery.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index 7c8d65414a70a..7d8025fb74b70 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -83,5 +83,6 @@ struct bq27xxx_device_info { void bq27xxx_battery_update(struct bq27xxx_device_info *di); int bq27xxx_battery_setup(struct bq27xxx_device_info *di); void bq27xxx_battery_teardown(struct bq27xxx_device_info *di); +extern const struct dev_pm_ops bq27xxx_battery_battery_pm_ops; #endif -- cgit v1.2.3 From 74d016ecc1a7974664e98d1afbf649cd4e0e0423 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Nov 2023 22:41:27 -0500 Subject: new helper: user_path_locked_at() Equivalent of kern_path_locked() taking dfd/userland name. User introduced in the next commit. 
Signed-off-by: Al Viro --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/namei.h b/include/linux/namei.h index 3100371b5e321..74e0cc14ebf86 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -66,6 +66,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); +extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); -- cgit v1.2.3 From 3185d57cfcd34fadbe28f4ed57a6cb5122277ece Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 14 Nov 2023 11:42:02 +0100 Subject: indirect_call_wrapper: Fix typo in INDIRECT_CALL_$NR kerneldoc Fix a small typo in the kerneldoc comment of the INDIRECT_CALL_$NR macro. Signed-off-by: Tobias Klauser Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231114104202.4680-1-tklauser@distanz.ch Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index c1c76a70a6ce9..adb83a42a6b90 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -11,7 +11,7 @@ * @__VA_ARGS__: arguments for @f * * Avoid retpoline overhead for known builtin, checking @f vs each of them and - * eventually invoking directly the builtin function. The functions are check + * eventually invoking directly the builtin function. The functions are checked * in the given order. Fallback to the indirect call. 
*/ #define INDIRECT_CALL_1(f, f1, ...) \ -- cgit v1.2.3 From f7f965c982f7954b46db910146a7ffe0fe1eb5e1 Mon Sep 17 00:00:00 2001 From: Tao Zhang Date: Thu, 28 Sep 2023 14:29:36 +0800 Subject: coresight-tpdm: Introduce TPDM subtype to TPDM driver Introduce the new subtype of "CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM" for TPDM components in driver. Signed-off-by: Tao Zhang Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/1695882586-10306-4-git-send-email-quic_taozha@quicinc.com --- include/linux/coresight.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a269fffaf991c..a4cb7dd6ca237 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -64,6 +64,7 @@ enum coresight_dev_subtype_source { CORESIGHT_DEV_SUBTYPE_SOURCE_PROC, CORESIGHT_DEV_SUBTYPE_SOURCE_BUS, CORESIGHT_DEV_SUBTYPE_SOURCE_SOFTWARE, + CORESIGHT_DEV_SUBTYPE_SOURCE_TPDM, CORESIGHT_DEV_SUBTYPE_SOURCE_OTHERS, }; -- cgit v1.2.3 From 3fc6350fc8470d42f5e700ecd1c3d90f9dd9fd2d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 13 Nov 2023 13:12:49 +0200 Subject: treewide, spi: Get rid of SPI_MASTER_HALF_DUPLEX The SPI_MASTER_HALF_DUPLEX is the legacy name of a definition for a half duplex flag. Since all others had been replaced with the respective SPI_CONTROLLER prefix get rid of the last one as well. There is no functional change intended. 
Signed-off-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Acked-by: Ulf Hansson # For MMC Acked-by: Dmitry Torokhov # for input Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20231113111249.3982461-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5a..7b4baff63c5c0 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1638,8 +1638,6 @@ spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) /* Compatibility layer */ #define spi_master spi_controller -#define SPI_MASTER_HALF_DUPLEX SPI_CONTROLLER_HALF_DUPLEX - #define spi_master_get_devdata(_ctlr) spi_controller_get_devdata(_ctlr) #define spi_master_set_devdata(_ctlr, _data) \ spi_controller_set_devdata(_ctlr, _data) -- cgit v1.2.3 From fac4a535758851215d23d7d92879aeee5035f51d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Aug 2023 19:27:56 +0300 Subject: device property: Add fwnode_property_match_property_string() Sometimes the users want to match the single value string property against an array of predefined strings. Create a helper for them. Signed-off-by: Andy Shevchenko Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20230808162800.61651-3-andriy.shevchenko@linux.intel.com Signed-off-by: Jonathan Cameron --- include/linux/property.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a86..2b8f07fc68a97 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -98,6 +98,18 @@ static inline bool device_is_compatible(const struct device *dev, const char *co return fwnode_device_is_compatible(dev_fwnode(dev), compat); } +int fwnode_property_match_property_string(const struct fwnode_handle *fwnode, + const char *propname, + const char * const *array, size_t n); + +static inline +int device_property_match_property_string(const struct device *dev, + const char *propname, + const char * const *array, size_t n) +{ + return fwnode_property_match_property_string(dev_fwnode(dev), propname, array, n); +} + int fwnode_property_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, -- cgit v1.2.3 From 96fa96e198f9707285003075fbbce7db6a485112 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Nov 2023 11:39:18 +0000 Subject: net: linkmode: add linkmode_fill() helper Add a linkmode_fill() helper, which will allow us to convert phylink's open coded bitmap_fill() operations. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/linkmode.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 7303b4bc2ce01..287f590ed56be 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -10,6 +10,11 @@ static inline void linkmode_zero(unsigned long *dst) bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static inline void linkmode_fill(unsigned long *dst) +{ + bitmap_fill(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + static inline void linkmode_copy(unsigned long *dst, const unsigned long *src) { bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS); -- cgit v1.2.3 From a9214a8883ceb82df55aa90d1c49ddb85fc1e3d5 Mon Sep 17 00:00:00 2001 From: Etienne Carriere Date: Mon, 30 Oct 2023 09:48:10 +0100 Subject: tee: system session Add the kernel client API function tee_client_system_session() for a client to request a system service entry in TEE context. This feature is needed to prevent a system deadlock when several TEE client applications invoke TEE, consuming all TEE thread contexts available in the secure world. The deadlock can happen in the OP-TEE driver, for example, if all these TEE threads issue an RPC call from TEE to Linux OS to access an eMMC RPMB partition (TEE secure storage) whose device clock or regulator controller is accessed through OP-TEE SCMI services. In that case, the Linux SCMI driver must reach the OP-TEE SCMI service without waiting until one of the consumed TEE threads is freed. 
Reviewed-by: Sumit Garg Co-developed-by: Jens Wiklander Signed-off-by: Etienne Carriere Signed-off-by: Jens Wiklander --- include/linux/tee_drv.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 17eb1c5205d34..911ddf92dcee7 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -84,6 +84,7 @@ struct tee_param { * @release: release this open file * @open_session: open a new session * @close_session: close a session + * @system_session: declare session as a system session * @invoke_func: invoke a trusted function * @cancel_req: request cancel of an ongoing invoke or open * @supp_recv: called for supplicant to get a command @@ -100,6 +101,7 @@ struct tee_driver_ops { struct tee_ioctl_open_session_arg *arg, struct tee_param *param); int (*close_session)(struct tee_context *ctx, u32 session); + int (*system_session)(struct tee_context *ctx, u32 session); int (*invoke_func)(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg, struct tee_param *param); @@ -429,6 +431,20 @@ int tee_client_open_session(struct tee_context *ctx, */ int tee_client_close_session(struct tee_context *ctx, u32 session); +/** + * tee_client_system_session() - Declare session as a system session + * @ctx: TEE Context + * @session: Session id + * + * This function requests TEE to provision an entry context ready to use for + * that session only. The provisioned entry context is used for command + * invocation and session closure, not for command cancelling requests. + * TEE releases the provisioned context upon session closure. + * + * Return < 0 on error else 0 if an entry context has been provisioned. 
+ */ +int tee_client_system_session(struct tee_context *ctx, u32 session); + /** * tee_client_invoke_func() - Invoke a function in a Trusted Application * @ctx: TEE Context -- cgit v1.2.3 From 5e4166461cf66a26f925011d90017da74e410747 Mon Sep 17 00:00:00 2001 From: Yang Hubin Date: Sat, 4 Nov 2023 00:45:01 -0700 Subject: f2fs: the name of a struct is wrong in a comment. The macro SUMMARY_SIZE represents the size of the struct f2fs_summary, instead of the size of the struct summary. Signed-off-by: Yang Hubin Signed-off-by: Qian Haolai Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 039fe0ce8d83d..053137a0fe456 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -441,7 +441,7 @@ struct f2fs_sit_block { * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node) */ #define ENTRIES_IN_SUM (F2FS_BLKSIZE / 8) -#define SUMMARY_SIZE (7) /* sizeof(struct summary) */ +#define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */ #define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */ #define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM) -- cgit v1.2.3 From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 09:14:04 -0800 Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0]. A few selftests and veristat need to be adjusted in the same patch as well. 
[0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 402b6bc44a1b7..52a4012b82555 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,7 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ - bool test_sanity_strict; /* fail verification on sanity violations */ + bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; -- cgit v1.2.3 From 2eea9ce4310d8c0f8ef1dbe7b0e7d9219ff02b97 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:02:00 +0200 Subject: mounts: keep list of mounts in an rbtree When adding a mount to a namespace insert it into an rbtree rooted in the mnt_namespace instead of a linear list. The mnt.mnt_list is still used to set up the mount tree and for propagation, but not after the mount has been added to a namespace. Hence mnt_list can live in union with rb_node. Use MNT_ONRB mount flag to validate that the mount is on the correct list. This allows removing the cursor used for reading /proc/$PID/mountinfo. The mnt_id_unique of the next mount can be used as an index into the seq file. Tested by inserting 100k bind mounts, unsharing the mount namespace, and unmounting. No performance regressions have been observed. 
For the last mount in the 100k list the statmount() call was more than 100x faster due to the mount ID lookup not having to do a linear search. This patch makes the overhead of mount ID lookup non-observable in this range. Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-3-mszeredi@redhat.com Reviewed-by: Ian Kent Signed-off-by: Christian Brauner --- include/linux/mount.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index ac3dd28761978..c34c18b4e8f36 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,8 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ - MNT_CURSOR) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) #define MNT_INTERNAL 0x4000 @@ -65,7 +64,7 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_CURSOR 0x10000000 +#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From 982c3b3058433f20aba9fb032599cee5dfc17328 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:08 +0200 Subject: bdev: rename freeze and thaw helpers We have bdev_mark_dead() etc and we're going to move block device freezing to holder ops in the next patch. Make the naming consistent: * freeze_bdev() -> bdev_freeze() * thaw_bdev() -> bdev_thaw() Also document the return code. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-2-599c19f4faac@kernel.org Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 51fa7ffdee83b..7a3da7f44afb7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1541,8 +1541,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) } #endif /* CONFIG_BLOCK */ -int freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev); +int bdev_freeze(struct block_device *bdev); +int bdev_thaw(struct block_device *bdev); struct io_comp_batch { struct request *req_list; -- cgit v1.2.3 From a30561a9be69d446d8d542a4f9735fe5ca9573df Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:10 +0200 Subject: bdev: add freeze and thaw holder operations Add block device freeze and thaw holder operations. Follow-up patches will implement block device freeze and thaw based on struct blk_holder_ops. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-4-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7a3da7f44afb7..1bc776335ff89 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1468,6 +1468,16 @@ struct blk_holder_ops { * Sync the file system mounted on the block device. */ void (*sync)(struct block_device *bdev); + + /* + * Freeze the file system mounted on the block device. + */ + int (*freeze)(struct block_device *bdev); + + /* + * Thaw the file system mounted on the block device. 
+ */ + int (*thaw)(struct block_device *bdev); }; extern const struct blk_holder_ops fs_holder_ops; -- cgit v1.2.3 From 49ef8832fb1a9e0da0020eb17480fd286433bc13 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 27 Sep 2023 15:21:16 +0200 Subject: bdev: implement freeze and thaw holder operations The old method of implementing block device freeze and thaw operations required us to rely on get_active_super() to walk the list of all superblocks on the system to find any superblock that might use the block device. This is wasteful and not very pleasant overall. Now that we can finally go straight from block device to owning superblock things become way simpler. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-5-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d5c5e59ddbd25..88e1848b08694 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -57,7 +57,7 @@ struct block_device { const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; /* The counter of freeze processes */ - int bd_fsfreeze_count; + atomic_t bd_fsfreeze_count; int bd_holders; struct kobject *bd_holder_dir; -- cgit v1.2.3 From 434f8d8299f2a0c97578f77ab23a70cd0ae56544 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:12 +0200 Subject: fs: remove get_active_super() This function is now unused so remove it. One less function that uses the global superblock list. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-6-599c19f4faac@kernel.org Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..7dc6c1bf5f553 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3121,7 +3121,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From 90f95dc415de23267b888f8238c4a19fa0f66b89 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:13 +0200 Subject: super: remove bd_fsfreeze_sb Remove bd_fsfreeze_sb as it's now unused and can be removed. Also move bd_fsfreeze_count down to not have it weirdly placed in the middle of the holder fields. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-7-599c19f4faac@kernel.org Reviewed-by: Darrick J. 
Wong Reviewed-by: Jan Kara Suggested-by: Jan Kara Suggested-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/blk_types.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 88e1848b08694..749203277feed 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -56,14 +56,11 @@ struct block_device { void * bd_holder; const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; - /* The counter of freeze processes */ - atomic_t bd_fsfreeze_count; int bd_holders; struct kobject *bd_holder_dir; - /* Mutex for freeze */ - struct mutex bd_fsfreeze_mutex; - struct super_block *bd_fsfreeze_sb; + atomic_t bd_fsfreeze_count; /* number of freeze requests */ + struct mutex bd_fsfreeze_mutex; /* serialize freeze/thaw */ struct partition_meta_info *bd_meta_info; #ifdef CONFIG_FAIL_MAKE_REQUEST -- cgit v1.2.3 From e419cf3ebaee694a826ddcfb350f1b1ebaf1e599 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:16 +0200 Subject: blkdev: comment fs_holder_ops Add a comment to @fs_holder_ops that @holder must point to a superblock. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-10-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1bc776335ff89..abf71cce785c2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1480,6 +1480,11 @@ struct blk_holder_ops { int (*thaw)(struct block_device *bdev); }; +/* + * For filesystems using @fs_holder_ops, the @holder argument passed to + * helpers used to open and claim block devices via + * bd_prepare_to_claim() must point to a superblock. 
+ */ extern const struct blk_holder_ops fs_holder_ops; /* -- cgit v1.2.3 From 7366f8b6fc6aa21c4199cb5d337b023df69745b0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 4 Nov 2023 15:00:13 +0100 Subject: fs: handle freezing from multiple devices Before [1] freezing a filesystem through the block layer only worked for the main block device as the owning superblock of additional block devices could not be found. Any filesystem that made use of multiple block devices would only be freezable via its main block device. For example, consider xfs over device mapper with /dev/dm-0 as main block device and /dev/dm-1 as external log device. Two freeze requests before [1]: (1) dmsetup suspend /dev/dm-0 on the main block device bdev_freeze(dm-0) -> dm-0->bd_fsfreeze_count++ -> freeze_super(xfs-sb) The owning superblock is found and the filesystem gets frozen. Returns 0. (2) dmsetup suspend /dev/dm-1 on the log device bdev_freeze(dm-1) -> dm-1->bd_fsfreeze_count++ The owning superblock isn't found and only the block device freeze count is incremented. Returns 0. Two freeze requests after [1]: (1') dmsetup suspend /dev/dm-0 on the main block device bdev_freeze(dm-0) -> dm-0->bd_fsfreeze_count++ -> freeze_super(xfs-sb) The owning superblock is found and the filesystem gets frozen. Returns 0. (2') dmsetup suspend /dev/dm-1 on the log device bdev_freeze(dm-1) -> dm-1->bd_fsfreeze_count++ -> freeze_super(xfs-sb) The owning superblock is found and the filesystem gets frozen. Returns -EBUSY. When (2') is called we initiate a freeze from another block device of the same superblock. So we increment the bd_fsfreeze_count for that additional block device. But we now also find the owning superblock for additional block devices and call freeze_super() again which reports -EBUSY. 
This can be reproduced through xfstests via: mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p4 /dev/nvme1n1p3 mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p6 /dev/nvme1n1p5 FSTYP=xfs export TEST_DEV=/dev/nvme1n1p3 export TEST_DIR=/mnt/test export TEST_LOGDEV=/dev/nvme1n1p4 export SCRATCH_DEV=/dev/nvme1n1p5 export SCRATCH_MNT=/mnt/scratch export SCRATCH_LOGDEV=/dev/nvme1n1p6 export USE_EXTERNAL=yes sudo ./check generic/311 Current semantics allow two concurrent freezers: one initiated from userspace via FREEZE_HOLDER_USERSPACE and one initiated from the kernel via FREEZE_HOLDER_KERNEL. If there are multiple concurrent freeze requests from either FREEZE_HOLDER_USERSPACE or FREEZE_HOLDER_KERNEL -EBUSY is returned. We need to preserve these semantics because they are uapi via FIFREEZE and FITHAW ioctl()s. IOW, freezes don't nest for FIFREEZE and FITHAW. Other kernel consumers rely on non-nesting freezes as well. With freezes initiated from the block layer freezes need to nest if the same superblock is frozen via multiple devices. So we need to start counting the number of freeze requests. If FREEZE_MAY_NEST is passed alongside FREEZE_HOLDER_KERNEL or FREEZE_HOLDER_USERSPACE we allow the caller to nest freeze calls. To accommodate the old semantics we split the freeze counter into two counting kernel initiated and userspace initiated freezes separately. We can then also stop recording FREEZE_HOLDER_* in struct sb_writers. We also simplify freezing by making all concurrent freezers share a single active superblock reference count instead of having separate references for kernel and userspace. I don't see why we would need two active reference counts. Neither FREEZE_HOLDER_KERNEL nor FREEZE_HOLDER_USERSPACE can put the active reference as long as they are concurrent freezers anyway. That was already true before we allowed nesting freezes. 
Survives various fstests runs with different options including the reproducer, online scrub, and online repair, fsfreeze, and so on. Also survives blktests. Link: https://lore.kernel.org/linux-block/87bkccnwxc.fsf@debian-BULLSEYE-live-builder-AMD64 Link: https://lore.kernel.org/r/20231104-vfs-multi-device-freeze-v2-2-5b5b69626eac@kernel.org Fixes: 288d8706abfc ("bdev: implement freeze and thaw holder operations") [1] # no backport needed Tested-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reported-by: Chandan Babu R Signed-off-by: Christian Brauner --- include/linux/fs.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7dc6c1bf5f553..b2a3f1c61c192 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1185,7 +1185,8 @@ enum { struct sb_writers { unsigned short frozen; /* Is sb frozen? */ - unsigned short freeze_holders; /* Who froze fs? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -2051,9 +2052,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time.
This is relied upon by some filesystems during online + * repair or similar. + */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { -- cgit v1.2.3 From cd34758c5238ae6976b10fe15bba7031b409c969 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2023 18:43:07 +0100 Subject: block: Remove blkdev_get_by_*() functions blkdev_get_by_*() and blkdev_put() functions are now unused. Remove them. Acked-by: Christoph Hellwig Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20231101174325.10596-2-jack@suse.cz Reviewed-by: Christian Brauner Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index abf71cce785c2..7afc10315dd5a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1500,10 +1500,6 @@ struct bdev_handle { blk_mode_t mode; }; -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops); -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, @@ -1511,7 +1507,6 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void blkdev_put(struct block_device *bdev, void *holder); void bdev_release(struct bdev_handle *handle); /* just for blk-cgroup, don't use elsewhere */ -- cgit v1.2.3 From ed5cc702d311c14b653323d76062b0294effa66e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2023 
18:43:08 +0100 Subject: block: Add config option to not allow writing to mounted devices Writing to mounted devices is dangerous and can lead to filesystem corruption as well as crashes. Furthermore syzbot comes with more and more involved examples how to corrupt block device under a mounted filesystem leading to kernel crashes and reports we can do nothing about. Add tracking of writers to each block device and a kernel cmdline argument which controls whether other writeable opens to block devices open with BLK_OPEN_RESTRICT_WRITES flag are allowed. We will make filesystems use this flag for used devices. Note that this effectively only prevents modification of the particular block device's page cache by other writers. The actual device content can still be modified by other means - e.g. by issuing direct scsi commands, by doing writes through devices lower in the storage stack (e.g. in case loop devices, DM, or MD are involved) etc. But blocking direct modifications of the block device page cache is enough to give filesystems a chance to perform data validation when loading data from the underlying storage and thus prevent kernel crashes. Syzbot can use this cmdline argument option to avoid uninteresting crashes. Also users whose userspace setup does not need writing to mounted block devices can set this option for hardening. 
Link: https://lore.kernel.org/all/60788e5d-5c7c-1142-e554-c21d709acfd9@linaro.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20231101174325.10596-3-jack@suse.cz Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 749203277feed..52e264d5a8303 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -66,6 +66,7 @@ struct block_device { #ifdef CONFIG_FAIL_MAKE_REQUEST bool bd_make_it_fail; #endif + int bd_writers; /* * keep this out-of-line as it's both big and not needed in the fast * path diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7afc10315dd5a..0e0c0186aa321 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -124,6 +124,8 @@ typedef unsigned int __bitwise blk_mode_t; #define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) /* open for "writes" only for ioctls (specialy hack for floppy.c) */ #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) +/* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ +#define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) struct gendisk { /* -- cgit v1.2.3 From 6f861765464f43a71462d52026fbddfc858239a5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Nov 2023 18:43:10 +0100 Subject: fs: Block writes to mounted block devices Ask block layer to block writes to block devices mounted by filesystems. 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20231101174325.10596-5-jack@suse.cz Reviewed-by: Christian Brauner Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0e0c0186aa321..9f6c3373f9fc5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1494,7 +1494,8 @@ extern const struct blk_holder_ops fs_holder_ops; * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ - (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) + (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ + (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) struct bdev_handle { struct block_device *bdev; -- cgit v1.2.3 From db3db63b1d17c98f69e894edaa2b0b364ecde7a9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 4 Nov 2023 23:11:17 +0100 Subject: vfs: remove a redundant might_sleep in wait_on_inode wait_on_bit already does it. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20231104221117.2584708-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/writeback.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c8..6d0a14f7019d1 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -193,7 +193,6 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. 
*/ static inline void wait_on_inode(struct inode *inode) { - might_sleep(); wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } -- cgit v1.2.3 From 446e2305827b76e8081057ce56bbd2703b4da8a9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:29 +0100 Subject: net: Convert PHYs hwtstamp callback to use kernel_hwtstamp_config The PHY hwtstamp callbacks are still getting the timestamp config from ifreq and using copy_from/to_user. Get rid of these functions by passing the timestamp configuration as a parameter. This also allows moving to kernel_hwtstamp_config, matching net devices that use the new ndo_hwtstamp_get/set. This adds the possibility to manipulate the timestamp configuration from the kernel, which was not possible with copy_from/to_user. Signed-off-by: Kory Maincent Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mii_timestamper.h | 4 +++- include/linux/phy.h | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index fa940bbaf8ae4..26b04f73f214b 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -9,6 +9,7 @@ #include #include #include +#include struct phy_device; @@ -51,7 +52,8 @@ struct mii_timestamper { struct sk_buff *skb, int type); int (*hwtstamp)(struct mii_timestamper *mii_ts, - struct ifreq *ifreq); + struct kernel_hwtstamp_config *kernel_config, + struct netlink_ext_ack *extack); void (*link_state)(struct mii_timestamper *mii_ts, struct phy_device *phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 3cc52826f18e9..e5f1f41e399c7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1560,9 +1560,11 @@ static inline bool phy_has_txtstamp(struct phy_device *phydev) return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } -static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) +static
inline int phy_hwtstamp(struct phy_device *phydev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { - return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); + return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, -- cgit v1.2.3 From b8768dc4077712915f045ba1b198f521493c7914 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 14 Nov 2023 12:28:31 +0100 Subject: net: ethtool: Refactor identical get_ts_info implementations. The vlan, macvlan and the bonding drivers call their "real" device driver in order to report the time stamping capabilities. Provide a core ethtool helper function to avoid copy/paste in the stack. Signed-off-by: Richard Cochran Signed-off-by: Kory Maincent Reviewed-by: Florian Fainelli Reviewed-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 689028257fccb..c2bb74143edab 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1043,6 +1043,14 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, return -EINVAL; } +/** + * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. + * @dev: pointer to net_device structure + * @info: buffer to hold the result + * Returns zero on success, non-zero otherwise. 
+ */ +int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info); + /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to a pointer to the start of string to update -- cgit v1.2.3 From 011dd3b3f83f9c89605c640424e05845b84f2dad Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:33 +0100 Subject: net: Make dev_set_hwtstamp_phylib accessible Make the dev_set_hwtstamp_phylib function accessible in preparation for using it from ethtool to reset the current timestamping configuration. Reviewed-by: Florian Fainelli Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a16c9cc063fe0..2d840d7056f20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3942,6 +3942,9 @@ int generic_hwtstamp_get_lower(struct net_device *dev, int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, -- cgit v1.2.3 From 51bdf3165f012827644c474a6d905baa3de3f1ea Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:40 +0100 Subject: net: Replace hwtstamp_source by timestamping layer Replace hwtstamp_source, which is only used by the kernel_hwtstamp_config structure, with the more widely used timestamping_layer enum. This is done to prepare for supporting a selectable timestamping source. Signed-off-by: Kory Maincent Signed-off-by: David S.
Miller --- include/linux/net_tstamp.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index eb01c37e71e0a..bb289c2ad3762 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,11 +5,6 @@ #include -enum hwtstamp_source { - HWTSTAMP_SOURCE_NETDEV, - HWTSTAMP_SOURCE_PHYLIB, -}; - /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -20,8 +15,8 @@ enum hwtstamp_source { * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space - * @source: indication whether timestamps should come from the netdev or from - * an attached phylib PHY + * @source: indication whether timestamps should come from software, the netdev + * or from an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -33,7 +28,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; - enum hwtstamp_source source; + enum timestamping_layer source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, -- cgit v1.2.3 From 0f7f463d4821a4f52fa5c0a961389e651d50c384 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:41 +0100 Subject: net: Change the API of PHY default timestamp to MAC Change the API to select MAC default time stamping instead of the PHY. Indeed the PHY is closer to the wire therefore theoretically it has less delay than the MAC timestamping but the reality is different. Due to lower time stamping clock frequency, latency in the MDIO bus and no PHC hardware synchronization between different PHY, the PHY PTP is often less precise than the MAC. 
The exception is for PHY designed specially for PTP case but these devices are not very widespread. For not breaking the compatibility I introduce a default_timestamp flag in phy_device that is set by the phy driver to know we are using the old API behavior. The phy_set_timestamp function is called at each call of phy_attach_direct. In case of MAC driver using phylink this function is called when the interface is turned up. Then if the interface goes down and up again the last choice of timestamp will be overwritten by the default choice. A solution could be to cache the timestamp status but it can bring other issues. In case of SFP, if we change the module, it doesn't make sense to blindly re-set the timestamp back to PHY, if the new module has a PHY with mediocre timestamping capabilities. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ include/linux/phy.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2d840d7056f20..f020d2790c12f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -2074,6 +2075,8 @@ enum netdev_ml_priv_type { * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. + * @ts_layer: Tracks which network device + * performs packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. 
@@ -2435,6 +2438,8 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif + + enum timestamping_layer ts_layer; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/phy.h b/include/linux/phy.h index e5f1f41e399c7..317def2a7843e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -604,6 +604,8 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume + * @default_timestamp: Flag indicating whether we are using the phy + * timestamp as the default one * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -667,6 +669,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + unsigned default_timestamp:1; + int rate_matching; enum phy_state state; -- cgit v1.2.3 From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:16 -0800 Subject: bpf: move verbose_linfo() into kernel/bpf/log.c verifier.c is huge. Let's try to move out parts that are logging-related into log.c, as we previously did with bpf_log() and other related stuff. This patch moves line info verbose output routines: it's pretty self-contained and isolated code, so there is no problem with this. 
Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 52a4012b82555..d896f3db6a223 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...); + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; -- cgit v1.2.3 From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:17 -0800 Subject: bpf: move verifier state printing code to kernel/bpf/log.c Move a good chunk of code from verifier.c to log.c: verifier state verbose printing logic. This is important and very much logging/debugging-oriented code. It fits log.c's overall focus on verifier logging, and moving it allows us to keep growing it without unnecessarily adding to verifier.c, which otherwise contains the core verification logic. There are not many shared dependencies between this code and the rest of the verifier.c code, except a few single-line helpers for various register type checks and a few state "scratching" helpers. We move all such trivial helpers into include/linux/bpf_verifier.h as static inlines. No functional changes in this patch.
Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d896f3db6a223..39edc76f436e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type) return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } +static inline bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + +static inline bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + +static inline bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + type = base_type(type); + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + +static inline bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; +} + +static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static inline void 
mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state. */ +static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); +const char *dynptr_type_str(enum bpf_dynptr_type type); +const char *iter_type_str(const struct btf *btf, u32 btf_id); +const char *iter_state_str(enum bpf_iter_state state); + +void print_verifier_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From acfde6e8abee6b23e53b08606f861d9124288030 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 4 Nov 2023 00:10:18 -0400 Subject: struct dentry: get rid of randomize_layout idiocy This is beyond ridiculous. There is a reason why that thing is cacheline-aligned... Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3da2f0545d5d7..1d9f7f1320553 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -111,7 +111,7 @@ struct dentry { struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; -} __randomize_layout; +}; /* * dentry->d_lock spinlock nesting subclasses: -- cgit v1.2.3 From 641c3ef5cb68a1426d42e6d3aba16db9bdfbe94f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 12:46:30 -0500 Subject: DCACHE_... 
->d_flags bits: switch to BIT() For bits 20..22 (inode type cached in ->d_flags) turn the definitions into expressions like (5 << 20); everything else turns into straight use of BIT() Signed-off-by: Al Viro --- include/linux/dcache.h | 76 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1d9f7f1320553..d9c314cc93b82 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -151,13 +151,13 @@ struct dentry_operations { */ /* d_flags entries */ -#define DCACHE_OP_HASH 0x00000001 -#define DCACHE_OP_COMPARE 0x00000002 -#define DCACHE_OP_REVALIDATE 0x00000004 -#define DCACHE_OP_DELETE 0x00000008 -#define DCACHE_OP_PRUNE 0x00000010 +#define DCACHE_OP_HASH BIT(0) +#define DCACHE_OP_COMPARE BIT(1) +#define DCACHE_OP_REVALIDATE BIT(2) +#define DCACHE_OP_DELETE BIT(3) +#define DCACHE_OP_PRUNE BIT(4) -#define DCACHE_DISCONNECTED 0x00000020 +#define DCACHE_DISCONNECTED BIT(5) /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first @@ -168,50 +168,50 @@ struct dentry_operations { * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ -#define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ +#define DCACHE_REFERENCED BIT(6) /* Recently used, don't discard. 
*/ -#define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ +#define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ -#define DCACHE_CANT_MOUNT 0x00000100 -#define DCACHE_GENOCIDE 0x00000200 -#define DCACHE_SHRINK_LIST 0x00000400 +#define DCACHE_CANT_MOUNT BIT(8) +#define DCACHE_GENOCIDE BIT(9) +#define DCACHE_SHRINK_LIST BIT(10) -#define DCACHE_OP_WEAK_REVALIDATE 0x00000800 +#define DCACHE_OP_WEAK_REVALIDATE BIT(11) -#define DCACHE_NFSFS_RENAMED 0x00001000 +#define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */ -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000 +#define DCACHE_COOKIE BIT(13) /* For use by dcookie subsystem */ +#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) /* Parent inode is watched by some fsnotify listener */ -#define DCACHE_DENTRY_KILLED 0x00008000 +#define DCACHE_DENTRY_KILLED BIT(15) -#define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */ -#define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */ -#define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */ +#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ +#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ +#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) -#define DCACHE_LRU_LIST 0x00080000 - -#define DCACHE_ENTRY_TYPE 0x00700000 -#define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */ -#define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ -#define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ -#define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */ -#define 
DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */ -#define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */ - -#define DCACHE_MAY_FREE 0x00800000 -#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ -#define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */ -#define DCACHE_OP_REAL 0x04000000 - -#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ -#define DCACHE_DENTRY_CURSOR 0x20000000 -#define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */ +#define DCACHE_LRU_LIST BIT(19) + +#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ +#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry (maybe fallthru to nowhere) */ +#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ +#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ +#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ +#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type (or fallthru to such) */ +#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type (or fallthru to such) */ +#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink (or fallthru to such) */ + +#define DCACHE_MAY_FREE BIT(23) +#define DCACHE_FALLTHRU BIT(24) /* Fall through to lower layer */ +#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ +#define DCACHE_OP_REAL BIT(26) + +#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ +#define DCACHE_DENTRY_CURSOR BIT(29) +#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ extern seqlock_t rename_lock; -- cgit v1.2.3 From 0bec65a80f1b1ebcda05286e539a204713b70353 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 12:50:29 -0500 Subject: DCACHE_COOKIE: RIP the last user gone in 2021... 
Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d9c314cc93b82..92c0b2a1ae2eb 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -181,7 +181,6 @@ struct dentry_operations { #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ -#define DCACHE_COOKIE BIT(13) /* For use by dcookie subsystem */ #define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) /* Parent inode is watched by some fsnotify listener */ -- cgit v1.2.3 From 8219cb58feddcf28909072015f4e17e29f68c41a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 14:32:05 -0500 Subject: kill d_{is,set}_fallthru() Introduced in 2015 and never had any in-tree users... Signed-off-by: Al Viro --- include/linux/dcache.h | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 92c0b2a1ae2eb..8cd937bb2292d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -195,16 +195,15 @@ struct dentry_operations { #define DCACHE_LRU_LIST BIT(19) #define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ -#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry (maybe fallthru to nowhere) */ +#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ #define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ #define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ #define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ -#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type (or fallthru to such) */ -#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type (or fallthru to such) */ -#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink (or fallthru to such) */ +#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ +#define 
DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ +#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ #define DCACHE_MAY_FREE BIT(23) -#define DCACHE_FALLTHRU BIT(24) /* Fall through to lower layer */ #define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ #define DCACHE_OP_REAL BIT(26) @@ -489,14 +488,6 @@ static inline int simple_positive(const struct dentry *dentry) return d_really_is_positive(dentry) && !d_unhashed(dentry); } -extern void d_set_fallthru(struct dentry *dentry); - -static inline bool d_is_fallthru(const struct dentry *dentry) -{ - return dentry->d_flags & DCACHE_FALLTHRU; -} - - extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) -- cgit v1.2.3 From 0d486510f86eb8162022ed61e6dc424a10909a10 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 15:22:40 -0500 Subject: dentry.h: trim externs d_instantiate_unique() had been gone for 7 years; __d_lookup...() and shrink_dcache_for_umount() are fs/internal.h fodder. 
Signed-off-by: Al Viro --- include/linux/dcache.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8cd937bb2292d..9706bf1dc5de6 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -218,7 +218,6 @@ extern seqlock_t rename_lock; */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); -extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); @@ -240,7 +239,6 @@ extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_for_umount(struct super_block *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ @@ -275,9 +273,6 @@ extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); -extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); -extern struct dentry *__d_lookup_rcu(const struct dentry *parent, - const struct qstr *name, unsigned *seq); static inline unsigned d_count(const struct dentry *dentry) { -- cgit v1.2.3 From 2fcd38f4de7256e2b5cb23ad22a6e3ebfea7dd18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 15:24:45 -0500 Subject: [software coproarchaeology] dentry.h: kill a mysterious comment there's a strange comment in front of d_lookup() declaration: /* appendix may either be NULL or be used for transname suffixes */ Looks like nobody had been curious 
enough to track its history; it predates git, it predates bitkeeper and if you look through the pre-BK trees, you finally arrive at this in 2.1.44-for-davem: /* appendix may either be NULL or be used for transname suffixes */ -extern struct dentry * d_lookup(struct inode * dir, struct qstr * name, - struct qstr * appendix); +extern struct dentry * d_lookup(struct dentry * dir, struct qstr * name); In other words, it refers to the third argument d_lookup() used to have back then. It had been introduced in 2.1.43-pre, on June 12 1997, along with d_lookup(), only to be removed by July 4 1997, presumably when the Cthulhu-awful thing it used to be used for (look for CONFIG_TRANS_NAMES in 2.1.43-pre, and keep a heavy-duty barfbag ready) had been, er, noticed and recognized for what it had been. Despite the appendectomy, the comment remained. Some things really need to be put out of their misery... Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9706bf1dc5de6..a5e5e274eee09 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -270,7 +270,6 @@ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); -/* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); -- cgit v1.2.3 From 698f1e2b71736977b04f951e2e2ef1c9a80696ff Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 16:19:59 -0500 Subject: kill d_backing_dentry() no users left Signed-off-by: Al Viro --- include/linux/dcache.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 
a5e5e274eee09..fa0414cff85c1 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -530,21 +530,6 @@ static inline struct inode *d_backing_inode(const struct dentry *upper) return inode; } -/** - * d_backing_dentry - Get upper or lower dentry we should be using - * @upper: The upper layer - * - * This is the helper that should be used to get the dentry of the inode that - * will be used if this dentry were opened as a file. It may be the upper - * dentry or it may be a lower dentry pinned by the upper. - * - * Normal filesystems should not use this to access their own dentries. - */ -static inline struct dentry *d_backing_dentry(struct dentry *upper) -{ - return upper; -} - /** * d_real - Return the real dentry * @dentry: the dentry to query -- cgit v1.2.3 From 289354f21b2c3fac93e956efd45f256a88a4d997 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Nov 2023 18:38:05 -0800 Subject: net: partial revert of the "Make timestamping selectable: series Revert following commits: commit acec05fb78ab ("net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask") commit 11d55be06df0 ("net: ethtool: Add a command to expose current time stamping layer") commit bb8645b00ced ("netlink: specs: Introduce new netlink command to get current timestamp") commit d905f9c75329 ("net: ethtool: Add a command to list available time stamping layers") commit aed5004ee7a0 ("netlink: specs: Introduce new netlink command to list available time stamping layers") commit 51bdf3165f01 ("net: Replace hwtstamp_source by timestamping layer") commit 0f7f463d4821 ("net: Change the API of PHY default timestamp to MAC") commit 091fab122869 ("net: ethtool: ts: Update GET_TS to reply the current selected timestamp") commit 152c75e1d002 ("net: ethtool: ts: Let the active time stamping layer be selectable") commit ee60ea6be0d3 ("netlink: specs: Introduce time stamping set command") They need more time for reviews. 
Link: https://lore.kernel.org/all/20231118183529.6e67100c@kernel.org/ Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 11 ++++++++--- include/linux/netdevice.h | 5 ----- include/linux/phy.h | 4 ---- 3 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index bb289c2ad3762..eb01c37e71e0a 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,6 +5,11 @@ #include +enum hwtstamp_source { + HWTSTAMP_SOURCE_NETDEV, + HWTSTAMP_SOURCE_PHYLIB, +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -15,8 +20,8 @@ * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space - * @source: indication whether timestamps should come from software, the netdev - * or from an attached phylib PHY + * @source: indication whether timestamps should come from the netdev or from + * an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -28,7 +33,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; - enum timestamping_layer source; + enum hwtstamp_source source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f020d2790c12f..2d840d7056f20 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -2075,8 +2074,6 @@ enum netdev_ml_priv_type { * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. - * @ts_layer: Tracks which network device - * performs packet time stamping. 
* * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2438,8 +2435,6 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif - - enum timestamping_layer ts_layer; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/phy.h b/include/linux/phy.h index 317def2a7843e..e5f1f41e399c7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -604,8 +604,6 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume - * @default_timestamp: Flag indicating whether we are using the phy - * timestamp as the default one * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -669,8 +667,6 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; - unsigned default_timestamp:1; - int rate_matching; enum phy_state state; -- cgit v1.2.3 From ac40916a3f7243efbe6e129ebf495b5c33a3adfe Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 15 Nov 2023 20:01:08 +0800 Subject: rtnetlink: introduce nlmsg_new_large and use it in rtnl_getlink if a PF has 256 or more VFs, ip link command will allocate an order 3 memory or more, and maybe trigger OOM due to memory fragment, the VFs needed memory size is computed in rtnl_vfinfo_size. 
so introduce nlmsg_new_large which calls netlink_alloc_large_skb in which vmalloc is used for large memory, to avoid the failure of allocating memory ip invoked oom-killer: gfp_mask=0xc2cc0(GFP_KERNEL|__GFP_NOWARN|\ __GFP_COMP|__GFP_NOMEMALLOC), order=3, oom_score_adj=0 CPU: 74 PID: 204414 Comm: ip Kdump: loaded Tainted: P OE Call Trace: dump_stack+0x57/0x6a dump_header+0x4a/0x210 oom_kill_process+0xe4/0x140 out_of_memory+0x3e8/0x790 __alloc_pages_slowpath.constprop.116+0x953/0xc50 __alloc_pages_nodemask+0x2af/0x310 kmalloc_large_node+0x38/0xf0 __kmalloc_node_track_caller+0x417/0x4d0 __kmalloc_reserve.isra.61+0x2e/0x80 __alloc_skb+0x82/0x1c0 rtnl_getlink+0x24f/0x370 rtnetlink_rcv_msg+0x12c/0x350 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1b2/0x280 netlink_sendmsg+0x355/0x4a0 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1ea/0x250 ___sys_sendmsg+0x88/0xd0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f95a65a5b70 Cc: Yunsheng Lin Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20231115120108.3711-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 75d7de34c9087..abe91ed6b9aa0 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -351,5 +351,6 @@ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); +struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */ -- cgit v1.2.3 From f25d34646bd01505a0989ca67bc9a37390cae755 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 3 Nov 2023 19:25:23 +0100 Subject: platform/x86: wmi: Add wmidev_block_set() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit Currently, WMI drivers have to use the deprecated GUID-based interface when setting data blocks. This prevents those drivers from fully moving away from this interface. Provide wmidev_block_set() so drivers using wmi_set_block() can fully migrate to the modern bus-based interface. Tested with a custom SSDT from the Intel Slim Bootloader project. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231103182526.3524-1-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/wmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 763bd382cf2d1..207544968268a 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -35,6 +35,8 @@ extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); +acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in); + u8 wmidev_instance_count(struct wmi_device *wdev); extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); -- cgit v1.2.3 From 7275bf3e09578e1761157e7683f2e898c5c235a6 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Fri, 3 Nov 2023 19:25:24 +0100 Subject: platform/x86: wmi: Add to_wmi_device() helper macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper macro for WMI drivers to cast a device to the corresponding WMI device. This should replace some boilerplate code. 
Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231103182526.3524-2-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/wmi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 207544968268a..8a643c39fcced 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -27,6 +27,14 @@ struct wmi_device { bool setable; }; +/** + * to_wmi_device() - Helper macro to cast a device to a wmi_device + * @device: device struct + * + * Cast a struct device to a struct wmi_device. + */ +#define to_wmi_device(device) container_of(device, struct wmi_device, dev) + extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, const struct acpi_buffer *in, -- cgit v1.2.3 From 48c9996f1dfe92bd7318472651c9ad538d6d53b5 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 6 Nov 2023 17:16:37 +0100 Subject: device property: Add SOFTWARE_NODE() macro for defining software nodes Add SOFTWARE_NODE() macro in order to make defining software nodes look nicer. This is analogous to different PROPERTY_ENTRY_*() macros for defining properties. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Reviewed-by: Heikki Krogerus Tested-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- include/linux/property.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a86..97f901c0914e3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -489,6 +489,13 @@ struct software_node { const struct property_entry *properties; }; +#define SOFTWARE_NODE(_name_, _properties_, _parent_) \ + (struct software_node) { \ + .name = _name_, \ + .properties = _properties_, \ + .parent = _parent_, \ + } + bool is_software_node(const struct fwnode_handle *fwnode); const struct software_node * to_software_node(const struct fwnode_handle *fwnode); -- cgit v1.2.3 From 44844db91397d3d94589f3c0c855be02daeebdb3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 9 Nov 2023 16:01:48 +0100 Subject: thermal: core: Add trip thresholds for trip crossing detection The trip crossing detection in handle_thermal_trip() does not work correctly in the cases when a trip point is crossed on the way up and then the zone temperature stays above its low temperature (that is, its temperature decreased by its hysteresis). The trip temperature may be passed by the zone temperature subsequently in that case, even multiple times, but that does not count as the trip crossing as long as the zone temperature does not fall below the trip's low temperature or, in other words, until the trip is crossed on the way down. |-----------low--------high------------| |<--------->| | hyst | | | | -|--> crossed on the way up | <---|-- crossed on the way down However, handle_thermal_trip() will invoke thermal_notify_tz_trip_up() every time the trip temperature is passed by the zone temperature on the way up regardless of whether or not the trip has been crossed on the way down yet. 
Moreover, it will not call thermal_notify_tz_trip_down() if the last zone temperature was between the trip's temperature and its low temperature, so some "trip crossed on the way down" events may not be reported. To address this issue, introduce trip thresholds equal to either the temperature of the given trip, or its low temperature, such that if the trip's threshold is passed by the zone temperature on the way up, its value will be set to the trip's low temperature and thermal_notify_tz_trip_up() will be called, and if the trip's threshold is passed by the zone temperature on the way down, its value will be set to the trip's temperature (high) and thermal_notify_tz_trip_down() will be called. Accordingly, if the threshold is passed on the way up, it cannot be passed on the way up again until it is passed on the way down and if it is passed on the way down, it cannot be passed on the way down again until it is passed on the way up which guarantees correct triggering of trip crossing notifications. If the last temperature of the zone is invalid, the trip's threshold will be set depending on the zone's current temperature: If that temperature is above the trip's temperature, its threshold will be set to its low temperature or otherwise its threshold will be set to its (high) temperature. Because the zone temperature is initially set to invalid and tz->last_temperature is only updated by update_temperature(), this is sufficient to set the correct initial threshold values for all trips. Link: https://lore.kernel.org/all/20220718145038.1114379-4-daniel.lezcano@linaro.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/thermal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index cee814d5d1acc..1f9ee869f9f9c 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -57,12 +57,14 @@ enum thermal_notify_event { * struct thermal_trip - representation of a point in temperature domain * @temperature: temperature value in miliCelsius * @hysteresis: relative hysteresis in miliCelsius + * @threshold: trip crossing notification threshold miliCelsius * @type: trip point type * @priv: pointer to driver data associated with this trip */ struct thermal_trip { int temperature; int hysteresis; + int threshold; enum thermal_trip_type type; void *priv; }; -- cgit v1.2.3 From d9dcdb4531fe39ce48919ef8c2c9369ee49f3ad2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 20 Oct 2023 11:21:07 +0200 Subject: PCI: host-generic: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is (mostly) ignored and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new() which already returns void. pci_host_common_remove() returned zero unconditionally. With that converted to return void instead, the generic pci host driver can be switched to .remove_new() trivially. 
Link: https://lore.kernel.org/r/20231020092107.2148311-1-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Signed-off-by: Bjorn Helgaas Acked-by: Will Deacon --- include/linux/pci-ecam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 6b1301e2498e9..3a4860bd27586 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -93,6 +93,6 @@ extern const struct pci_ecam_ops loongson_pci_ecam_ops; /* Loongson PCIe */ #if IS_ENABLED(CONFIG_PCI_HOST_COMMON) /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev); -int pci_host_common_remove(struct platform_device *pdev); +void pci_host_common_remove(struct platform_device *pdev); #endif #endif -- cgit v1.2.3 From 3171e46d677a668eed3086da78671f1e4f5b8405 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 30 Oct 2023 13:42:18 +0200 Subject: PCI: Avoid potential out-of-bounds read in pci_dev_for_each_resource() Coverity complains that pointer in the pci_dev_for_each_resource() may be wrong, i.e., might be used for the out-of-bounds read. There is no actual issue right now because we have another check afterwards and the out-of-bounds read is not being performed. In any case it's better code with this fixed, hence the proposed change. As Jonas pointed out "It probably makes the code slightly less performant as res will now be checked for being not NULL (which will always be true), but I doubt it will be significant (or in any hot paths)." 
Fixes: 09cc90063240 ("PCI: Introduce pci_dev_for_each_resource()") Reported-by: Bjorn Helgaas Closes: https://lore.kernel.org/r/20230509182122.GA1259567@bhelgaas Suggested-by: Jonas Gorski Link: https://lore.kernel.org/r/20231030114218.2752236-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc8679..4ebecc7896ef9 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2127,14 +2127,14 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma); (pci_resource_end((dev), (bar)) ? \ resource_size(pci_resource_n((dev), (bar))) : 0) -#define __pci_dev_for_each_res0(dev, res, ...) \ - for (unsigned int __b = 0; \ - res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES; \ +#define __pci_dev_for_each_res0(dev, res, ...) \ + for (unsigned int __b = 0; \ + __b < PCI_NUM_RESOURCES && (res = pci_resource_n(dev, __b)); \ __b++) -#define __pci_dev_for_each_res1(dev, res, __b) \ - for (__b = 0; \ - res = pci_resource_n(dev, __b), __b < PCI_NUM_RESOURCES; \ +#define __pci_dev_for_each_res1(dev, res, __b) \ + for (__b = 0; \ + __b < PCI_NUM_RESOURCES && (res = pci_resource_n(dev, __b)); \ __b++) #define pci_dev_for_each_resource(dev, res, ...) \ -- cgit v1.2.3 From 600f111ef51dc2cbdb330b09d09f1856efa64912 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 21:58:23 +0000 Subject: fs: Rename mapping private members It is hard to find where mapping->private_lock, mapping->private_list and mapping->private_data are used, due to private_XXX being a relatively common name for variables and structure members in the kernel. To fit with other members of struct address_space, rename them all to have an i_ prefix. Tested with an allmodconfig build. 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20231117215823.2821906-1-willy@infradead.org Acked-by: Darrick J. Wong Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/fs.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..f171505940ff7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -463,9 +463,9 @@ extern const struct address_space_operations empty_aops; * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. - * @private_lock: For use by the owner of the address_space. - * @private_list: For use by the owner of the address_space. - * @private_data: For use by the owner of the address_space. + * @i_private_lock: For use by the owner of the address_space. + * @i_private_list: For use by the owner of the address_space. + * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; @@ -484,9 +484,9 @@ struct address_space { unsigned long flags; struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; - spinlock_t private_lock; - struct list_head private_list; - void *private_data; + spinlock_t i_private_lock; + struct list_head i_private_list; + void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but -- cgit v1.2.3 From f47507988145185aef5d0e7a0e28dbf6e7776f29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 17 Oct 2023 22:05:23 +0200 Subject: thermal: ACPI: Move the ACPI thermal library to drivers/acpi/ The ACPI thermal library contains functions that can be used to retrieve trip point temperature values through the platform firmware for various types of trip points. 
Each of these functions basically evaluates a specific ACPI object, checks if the value produced by it is reasonable and returns it (or THERMAL_TEMP_INVALID if anything fails). It made sense to hold it in drivers/thermal/ so long as it was only used by the code in that directory, but since it is also going to be used by the ACPI thermal driver located in drivers/acpi/, move it to the latter in order to keep the code related to evaluating ACPI objects defined in the specification proper together. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 +++++++ include/linux/thermal.h | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 54189e0e5f419..b63d7811c7287 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -424,6 +424,13 @@ extern int acpi_blacklisted(void); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); +#ifdef CONFIG_ACPI_THERMAL_LIB +int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); +int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); +int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); +int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); +#endif + #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index cee814d5d1acc..35f6200594569 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -294,13 +294,6 @@ int thermal_zone_get_num_trips(struct thermal_zone_device *tz); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); -#ifdef CONFIG_THERMAL_ACPI -int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); -int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); -int 
thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); -int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); -#endif - #ifdef CONFIG_THERMAL struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, -- cgit v1.2.3 From 35732699f5d2922ff674e711e566cf44a4bd86d2 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 22 Nov 2023 08:33:53 -0700 Subject: ACPI: Fix ARM32 platforms compile issue introduced by fw_table changes Linus reported that: After commit a103f46633fd the kernel stopped compiling for several ARM32 platforms that I am building with a bare metal compiler. Bare metal compilers (arm-none-eabi-) don't define __linux__. This is because the header acpi/platform/acenv.h is now in the include path for linux/irqchip.h: CC arch/arm/kernel/irq.o CC kernel/sysctl.o CC crypto/api.o In file included from ../include/acpi/acpi.h:22, from ../include/linux/fw_table.h:29, from ../include/linux/acpi.h:18, from ../include/linux/irqchip.h:14, from ../arch/arm/kernel/irq.c:25: ../include/acpi/platform/acenv.h:218:2: error: #error Unknown target environment 218 | #error Unknown target environment | ^~~~~ The issue is caused by splitting out the ACPI code to support the new generic fw_table code. Rafael suggested [1] moving the fw_table.h include in linux/acpi.h to below the linux/mutex.h. Remove the two includes in fw_table.h. Replace linux/fw_table.h include in fw_table.c with linux/acpi.h. Link: https://lore.kernel.org/linux-acpi/CAJZ5v0idWdJq3JSqQWLG5q+b+b=zkEdWR55rGYEoxh7R6N8kFQ@mail.gmail.com/ Fixes: a103f46633fd ("acpi: Move common tables helper functions to common lib") Closes: https://lore.kernel.org/linux-acpi/20231114-arm-build-bug-v1-1-458745fe32a4@linaro.org/ Reported-by: Linus Walleij Suggested-by: Rafael J. Wysocki Tested-by: Linus Walleij Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 22 +++++++++++----------- include/linux/fw_table.h | 3 --- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 54189e0e5f419..4db54e928b36d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -15,7 +15,6 @@ #include #include #include -#include struct irq_domain; struct irq_domain_ops; @@ -25,22 +24,13 @@ struct irq_domain_ops; #endif #include -#ifdef CONFIG_ACPI_TABLE_LIB -#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI) -#define __init_or_acpilib -#define __initdata_or_acpilib -#else -#define EXPORT_SYMBOL_ACPI_LIB(x) -#define __init_or_acpilib __init -#define __initdata_or_acpilib __initdata -#endif - #ifdef CONFIG_ACPI #include #include #include #include +#include #include #include @@ -48,6 +38,16 @@ struct irq_domain_ops; #include #include +#ifdef CONFIG_ACPI_TABLE_LIB +#define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI) +#define __init_or_acpilib +#define __initdata_or_acpilib +#else +#define EXPORT_SYMBOL_ACPI_LIB(x) +#define __init_or_acpilib __init +#define __initdata_or_acpilib __initdata +#endif + static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return adev ? 
adev->handle : NULL; diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h index ff8fa58d5818b..ca49947f0a775 100644 --- a/include/linux/fw_table.h +++ b/include/linux/fw_table.h @@ -25,9 +25,6 @@ struct acpi_subtable_proc { int count; }; -#include -#include - union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; -- cgit v1.2.3 From 21f4c443731fdb064c0dd31a743aafd0b075156c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 20 Nov 2023 18:47:20 +0100 Subject: soundwire: stream: constify sdw_port_config when adding devices sdw_stream_add_master() and sdw_stream_add_slave() do not modify contents of passed sdw_port_config, so it can be made const for code safety and as documentation of expected usage. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20231120174720.239610-1-krzysztof.kozlowski@linaro.org Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 4f3d14bb15385..904004d8b5622 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1040,7 +1040,7 @@ int sdw_compute_params(struct sdw_bus *bus); int sdw_stream_add_master(struct sdw_bus *bus, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream); int sdw_stream_remove_master(struct sdw_bus *bus, @@ -1062,7 +1062,7 @@ void sdw_extract_slave_id(struct sdw_bus *bus, u64 addr, struct sdw_slave_id *id int sdw_stream_add_slave(struct sdw_slave *slave, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream); int sdw_stream_remove_slave(struct sdw_slave 
*slave, @@ -1084,7 +1084,7 @@ int sdw_update_no_pm(struct sdw_slave *slave, u32 addr, u8 mask, u8 val); static inline int sdw_stream_add_slave(struct sdw_slave *slave, struct sdw_stream_config *stream_config, - struct sdw_port_config *port_config, + const struct sdw_port_config *port_config, unsigned int num_ports, struct sdw_stream_runtime *stream) { -- cgit v1.2.3 From 57b79ac9f43dc71fc8b55af51d1c9f469cb7a0de Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 5 May 2022 01:25:16 +0900 Subject: soc: apple: rtkit: Get rid of apple_rtkit_send_message_wait It is fundamentally broken and has no users. Just remove it. Acked-by: Eric Curtin Acked-by: Neal Gompa Acked-by: Alyssa Rosenzweig Signed-off-by: Hector Martin --- include/linux/soc/apple/rtkit.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/apple/rtkit.h b/include/linux/soc/apple/rtkit.h index fc456f75c1319..8c9ca857ccf6a 100644 --- a/include/linux/soc/apple/rtkit.h +++ b/include/linux/soc/apple/rtkit.h @@ -160,24 +160,6 @@ int apple_rtkit_start_ep(struct apple_rtkit *rtk, u8 endpoint); int apple_rtkit_send_message(struct apple_rtkit *rtk, u8 ep, u64 message, struct completion *completion, bool atomic); -/* - * Send a message to the given endpoint and wait until it has been submitted - * to the hardware FIFO. - * Will return zero on success and a negative error code on failure - * (e.g. -ETIME when the message couldn't be written within the given - * timeout) - * - * @rtk: RTKit reference - * @ep: target endpoint - * @message: message to be sent - * @timeout: timeout in milliseconds to allow the message transmission - * to be completed - * @atomic: if set to true this function can be called from atomic - * context. - */ -int apple_rtkit_send_message_wait(struct apple_rtkit *rtk, u8 ep, u64 message, - unsigned long timeout, bool atomic); - /* * Process incoming messages in atomic context. 
* This only guarantees that messages arrive as far as the recv_message_early -- cgit v1.2.3 From 143897c4fa976d02bfafe5ae32b9ffc60dc6145a Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Tue, 14 Mar 2023 19:47:32 +0900 Subject: mailbox: apple: Delete driver This driver is now orphaned and superseded by drivers/soc/apple/mailbox.c. Acked-by: Eric Curtin Acked-by: Neal Gompa Acked-by: Alyssa Rosenzweig Signed-off-by: Hector Martin --- include/linux/apple-mailbox.h | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 include/linux/apple-mailbox.h (limited to 'include/linux') diff --git a/include/linux/apple-mailbox.h b/include/linux/apple-mailbox.h deleted file mode 100644 index 720fbb70294aa..0000000000000 --- a/include/linux/apple-mailbox.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ -/* - * Apple mailbox message format - * - * Copyright (C) 2021 The Asahi Linux Contributors - */ - -#ifndef _LINUX_APPLE_MAILBOX_H_ -#define _LINUX_APPLE_MAILBOX_H_ - -#include - -/* encodes a single 96bit message sent over the single channel */ -struct apple_mbox_msg { - u64 msg0; - u32 msg1; -}; - -#endif -- cgit v1.2.3 From 9c0b4bb7f6303c9c4e2e34984c46f5a86478f84d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 22 Nov 2023 14:39:03 +0100 Subject: sched/cpufreq: Rework schedutil governor performance estimation The current method to take into account uclamp hints when estimating the target frequency can end in a situation where the selected target frequency is finally higher than uclamp hints, whereas there are no real needs. Such cases mainly happen because we are currently mixing the traditional scheduler utilization signal with the uclamp performance hints. By adding these 2 metrics, we lose important information when it comes to selecting the target frequency, and we have to make some assumptions which can't fit all cases.
Rework the interface between the scheduler and schedutil governor in order to propagate all information down to the cpufreq governor. effective_cpu_util() interface changes and now returns the actual utilization of the CPU with 2 optional inputs: - The minimum performance for this CPU; typically the capacity to handle the deadline task and the interrupt pressure. But also uclamp_min request when available. - The maximum targeting performance for this CPU which reflects the maximum level that we would like to not exceed. By default it will be the CPU capacity but can be reduced because of some performance hints set with uclamp. The value can be lower than actual utilization and/or min performance level. A new sugov_effective_cpu_perf() interface is also available to compute the final performance level that is targeted for the CPU, after applying some cpufreq headroom and taking into account all inputs. With these 2 functions, schedutil is now able to decide when it must go above uclamp hints. It now also has a generic way to get the min performance level. The dependency between energy model and cpufreq governor and its headroom policy doesn't exist anymore. eenv_pd_max_util() asks schedutil for the targeted performance after applying the impact of the waking task. [ mingo: Refined the changelog & C comments. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org --- include/linux/energy_model.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b9caa01dfac48..adec808b371a1 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -243,7 +243,6 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, scale_cpu = arch_scale_cpu_capacity(cpu); ps = &pd->table[pd->nr_perf_states - 1]; - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); freq = map_util_freq(max_util, ps->frequency, scale_cpu); -- cgit v1.2.3 From 64bac5ea17d527872121adddfee869c7a0618f8f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 8 Nov 2023 13:58:29 +0100 Subject: arch: consolidate arch_irq_work_raise prototypes The prototype was hidden in an #ifdef on x86, which causes a warning: kernel/irq_work.c:72:13: error: no previous prototype for 'arch_irq_work_raise' [-Werror=missing-prototypes] Some architectures have a working prototype, while others don't. Fix this by providing it in only one place that is always visible. 
Reviewed-by: Alexander Gordeev Acked-by: Catalin Marinas Acked-by: Palmer Dabbelt Acked-by: Guo Ren Signed-off-by: Arnd Bergmann --- include/linux/irq_work.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 8cd11a2232605..136f2980cba30 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -66,6 +66,9 @@ void irq_work_sync(struct irq_work *work); void irq_work_run(void); bool irq_work_needs_cpu(void); void irq_work_single(void *arg); + +void arch_irq_work_raise(void); + #else static inline bool irq_work_needs_cpu(void) { return false; } static inline void irq_work_run(void) { } -- cgit v1.2.3 From a769154c7cac037914ba375ae88aae55b2c853e0 Mon Sep 17 00:00:00 2001 From: Hardik Gajjar Date: Fri, 27 Oct 2023 17:20:28 +0200 Subject: usb: xhci: Add timeout argument in address_device USB HCD callback - The HCD address_device callback now accepts a user-defined timeout value in milliseconds, providing better control over command execution times. - The default timeout value for the address_device command has been set to 5000 ms, aligning with the USB 3.2 specification. However, this timeout can be adjusted as needed. - The xhci_setup_device function has been updated to accept the timeout value, allowing it to specify the maximum wait time for the command operation to complete. - The hub driver has also been updated to accommodate the newly added timeout parameter during the SET_ADDRESS request. 
Signed-off-by: Hardik Gajjar Reviewed-by: Mathias Nyman Link: https://lore.kernel.org/r/20231027152029.104363-1-hgajjar@de.adit-jv.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/hcd.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 00724b4f6e122..cd77fc6095a15 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -372,8 +372,9 @@ struct hc_driver { * or bandwidth constraints. */ void (*reset_bandwidth)(struct usb_hcd *, struct usb_device *); - /* Returns the hardware-chosen device address */ - int (*address_device)(struct usb_hcd *, struct usb_device *udev); + /* Set the hardware-chosen device address */ + int (*address_device)(struct usb_hcd *, struct usb_device *udev, + unsigned int timeout_ms); /* prepares the hardware to send commands to the device */ int (*enable_device)(struct usb_hcd *, struct usb_device *udev); /* Notifies the HCD after a hub descriptor is fetched. -- cgit v1.2.3 From 5a1ccf0c72cf917ff3ccc131d1bb8d19338ffe52 Mon Sep 17 00:00:00 2001 From: Hardik Gajjar Date: Fri, 27 Oct 2023 17:20:29 +0200 Subject: usb: new quirk to reduce the SET_ADDRESS request timeout This patch introduces a new USB quirk, USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT, which modifies the timeout value for the SET_ADDRESS request. The standard timeout for USB request/command is 5000 ms, as recommended in the USB 3.2 specification (section 9.2.6.1). However, certain scenarios, such as connecting devices through an APTIV hub, can lead to timeout errors when the device enumerates as full speed initially and later switches to high speed during chirp negotiation. In such cases, USB analyzer logs reveal that the bus suspends for 5 seconds due to incorrect chirp parsing and resumes only after two consecutive timeout errors trigger a hub driver reset. Packet(54) Dir(?) Full Speed J(997.100 us) Idle( 2.850 us) _______| Time Stamp(28 . 
105 910 682) _______|_____________________________________________________________Ch0 Packet(55) Dir(?) Full Speed J(997.118 us) Idle( 2.850 us) _______| Time Stamp(28 . 106 910 632) _______|_____________________________________________________________Ch0 Packet(56) Dir(?) Full Speed J(399.650 us) Idle(222.582 us) _______| Time Stamp(28 . 107 910 600) _______|_____________________________________________________________Ch0 Packet(57) Dir Chirp J( 23.955 ms) Idle(115.169 ms) _______| Time Stamp(28 . 108 532 832) _______|_____________________________________________________________Ch0 Packet(58) Dir(?) Full Speed J (Suspend)( 5.347 sec) Idle( 5.366 us) _______| Time Stamp(28 . 247 657 600) _______|_____________________________________________________________Ch0 This 5-second delay in device enumeration is undesirable, particularly in automotive applications where quick enumeration is crucial (ideally within 3 seconds). The newly introduced quirks provide the flexibility to align with a 3-second time limit, as required in specific contexts like automotive applications. By reducing the SET_ADDRESS request timeout to 500 ms, the system can respond more swiftly to errors, initiate rapid recovery, and ensure efficient device enumeration. This change is vital for scenarios where rapid smartphone enumeration and screen projection are essential. 
To use the quirk, please write "vendor_id:product_id:p" to /sys/bus/usb/drivers/hub/module/parameters/quirks For example, echo "0x2c48:0x0132:p" > /sys/bus/usb/drivers/hub/module/parameters/quirks Signed-off-by: Hardik Gajjar Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20231027152029.104363-2-hgajjar@de.adit-jv.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/quirks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index eeb7c2157c72f..59409c1fc3dee 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -72,4 +72,7 @@ /* device has endpoints that should be ignored */ #define USB_QUIRK_ENDPOINT_IGNORE BIT(15) +/* short SET_ADDRESS request timeout */ +#define USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT BIT(16) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From d4e3b928ab487a8aecd1f6a140b40ac365116cfb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 17 Nov 2023 19:13:27 -0500 Subject: closures: CLOSURE_CALLBACK() to fix type punning Control flow integrity is now checking that type signatures match on indirect function calls. That breaks closures, which embed a work_struct in a closure in such a way that a closure_fn may also be used as a workqueue fn by the underlying closure code. So we have to change closure fns to take a work_struct as their argument - but that results in a loss of clarity, as closure fns have different semantics from normal workqueue functions (they run owning a ref on the closure, which must be released with continue_at() or closure_return()). Thus, this patch introduces CLOSURE_CALLBACK() and closure_type() macros as suggested by Kees, to smooth things over a bit.
Suggested-by: Kees Cook Cc: Coly Li Signed-off-by: Kent Overstreet --- include/linux/closure.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/closure.h b/include/linux/closure.h index de7bb47d8a46a..c554c6a08768a 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -104,7 +104,7 @@ struct closure; struct closure_syncer; -typedef void (closure_fn) (struct closure *); +typedef void (closure_fn) (struct work_struct *); extern struct dentry *bcache_debug; struct closure_waitlist { @@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl) INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); } else - cl->fn(cl); + cl->fn(&cl->work); } /** @@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list) __closure_wake_up(list); } +#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws) +#define closure_type(name, type, member) \ + struct closure *cl = container_of(ws, struct closure, work); \ + type *name = container_of(cl, type, member) + /** * continue_at - jump to another function with barrier * -- cgit v1.2.3 From 6543ac13c623f906200dfd3f1c407d8d333b6995 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Tue, 17 Oct 2023 11:09:32 -0500 Subject: soundwire: bus: introduce controller_id The existing SoundWire support misses a clear Controller/Manager hierarchical definition to deal with all variants across SOC vendors. a) Intel platforms have one controller with 4 or more Managers. b) AMD platforms have two controllers with one Manager each, but due to BIOS issues use two different link_id values within the scope of a single controller. c) QCOM platforms have one or more controllers with one Manager each. This patch adds a 'controller_id' which can be set by higher levels. If assigned to -1, the controller_id will be set to the system-unique IDA-assigned bus->id.
The main change is that the bus->id is no longer used for any device name, which makes the definition completely predictable and not dependent on any enumeration order. The bus->id is only used to insert the Managers in the stream rt context. Reviewed-by: Bard Liao Reviewed-by: Vijendar Mukunda Signed-off-by: Pierre-Louis Bossart Reviewed-by: Krzysztof Kozlowski Tested-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/stable/20231017160933.12624-2-pierre-louis.bossart%40linux.intel.com Tested-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231017160933.12624-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 904004d8b5622..66f814b63a435 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -886,7 +886,8 @@ struct sdw_master_ops { * struct sdw_bus - SoundWire bus * @dev: Shortcut to &bus->md->dev to avoid changing the entire code. * @md: Master device - * @link_id: Link id number, can be 0 to N, unique for each Master + * @controller_id: system-unique controller ID. If set to -1, the bus @id will be used. + * @link_id: Link id number, can be 0 to N, unique for each Controller * @id: bus system-wide unique id * @slaves: list of Slaves on this bus * @assigned: Bitmap for Slave device numbers. @@ -918,6 +919,7 @@ struct sdw_master_ops { struct sdw_bus { struct device *dev; struct sdw_master_device *md; + int controller_id; unsigned int link_id; int id; struct list_head slaves; -- cgit v1.2.3 From 8802e580ee643e3f63c6b39ff64e7c7baa4a55ba Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:13 +0200 Subject: fs: create __sb_write_started() helper Similar to sb_write_started() for use by other sb freeze levels. 
Unlike the boolean sb_write_started(), this helper returns a tristate to distinguish the cases of lockdep disabled or unknown lock state. This is needed for fanotify "pre content" events. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-15-amir73il@gmail.com Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..ac8b5a9b467b7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1645,9 +1645,23 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * > 0 sb freeze level is held + * 0 sb freeze level is not held + * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + static inline bool sb_write_started(const struct super_block *sb) { - return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1); + return __sb_write_started(sb, SB_FREEZE_WRITE); } /** -- cgit v1.2.3 From 3d5cd4911e04683df8f4439fddd788e00a2510a8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:14 +0200 Subject: fs: create file_write_started() helper Convenience wrapper for sb_write_started(file_inode(file)->i_sb), which has a single occurrence in the code right now. Document the false negatives of those helpers, which makes them unusable to assert that sb_start_write() is not held.
Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-16-amir73il@gmail.com Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/fs.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ac8b5a9b467b7..75a10b632edd5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1659,11 +1659,32 @@ static inline int __sb_write_started(const struct super_block *sb, int level) return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } +/** + * file_write_started - check if SB_FREEZE_WRITE is held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_started(file_inode(file)->i_sb); +} + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to -- cgit v1.2.3 From 21b32e6a0ab5b174fa1ca2fb4c212577cf405d83 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 14:27:15 +0200 Subject: fs: create {sb,file}_write_not_started() helpers Create new helpers {sb,file}_write_not_started() that can be used to assert that sb_start_write() is not held. This is needed for fanotify "pre content" events. 
Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231122122715.2561213-17-amir73il@gmail.com Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/fs.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 75a10b632edd5..ae0e2fb7bcea8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1670,6 +1670,17 @@ static inline bool sb_write_started(const struct super_block *sb) return __sb_write_started(sb, SB_FREEZE_WRITE); } +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to @@ -1685,6 +1696,21 @@ static inline bool file_write_started(const struct file *file) return sb_write_started(file_inode(file)->i_sb); } +/** + * file_write_not_started - check if SB_FREEZE_WRITE is not held + * @file: the file we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + * May be false positive with !S_ISREG, because file_start_write() has + * no effect on !S_ISREG. + */ +static inline bool file_write_not_started(const struct file *file) +{ + if (!S_ISREG(file_inode(file)->i_mode)) + return true; + return sb_write_not_started(file_inode(file)->i_sb); +} + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to -- cgit v1.2.3 From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Nov 2023 19:59:36 -0800 Subject: bpf: Validate global subprogs lazily Slightly change BPF verifier logic around eagerness and order of global subprog validation. 
Instead of going over every global subprog eagerly and validating it before main (entry) BPF program is verified, turn it around. Validate main program first, mark subprogs that were called from main program for later verification, but otherwise assume it is valid. Afterwards, go over marked global subprogs and validate those, potentially marking some more global functions as being called. Continue this process until all (transitively) callable global subprogs are validated. It's a BFS traversal at its heart and will always converge. This is an important change because it allows to feature-gate some subprograms that might not be verifiable on some older kernel, depending on supported set of features. E.g., at some point, global functions were allowed to accept a pointer to memory, which size is identified by user-provided type. Unfortunately, older kernels don't support this feature. With BPF CO-RE approach, the natural way would be to still compile BPF object file once and guard calls to this global subprog with some CO-RE check or using .rodata variables. That's what people do to guard usage of new helpers or kfuncs, and any other new BPF-side feature that might be missing on old kernels. That's currently impossible to do with global subprogs, unfortunately, because they are eagerly and unconditionally validated. This patch set aims to change this, so that in the future when global funcs gain new features, those can be guarded using BPF CO-RE techniques in the same fashion as any other new kernel feature. Two selftests had to be adjusted in sync with these changes. test_global_func12 relied on eager global subprog validation failing before main program failure is detected (unknown return value). Fix by making sure that main program is always valid. 
verifier_subprog_precision's parent_stack_slot_precise subtest relied on verifier checkpointing heuristic to do a checkpoint at instruction #5, but that's no longer true because we don't have enough jumps validated before reaching insn #5 due to global subprogs being validated later. Other than that, no changes, as one would expect. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 258ba232e3021..eb447b0a94231 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) struct bpf_func_info_aux { u16 linkage; bool unreliable; + bool called : 1; + bool verified : 1; }; enum bpf_jit_poke_reason { -- cgit v1.2.3 From ef5828805842204dd0259ecfc132b5916c8a77ae Mon Sep 17 00:00:00 2001 From: Michael-CY Lee Date: Wed, 22 Nov 2023 11:02:37 +0800 Subject: wifi: avoid offset calculation on NULL pointer ieee80211_he_6ghz_oper() can be passed a NULL pointer and checks for that, but already did the calculation to inside of it before. Move it after the check. 
Signed-off-by: Michael-CY Lee Link: https://lore.kernel.org/r/20231122030237.31276-1-michael-cy.lee@mediatek.com [rewrite commit message] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 958771bac9c02..c2ac9e9e7ee9a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2830,12 +2830,14 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) static inline const struct ieee80211_he_6ghz_oper * ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) { - const u8 *ret = (const void *)&he_oper->optional; + const u8 *ret; u32 he_oper_params; if (!he_oper) return NULL; + ret = (const void *)&he_oper->optional; + he_oper_params = le32_to_cpu(he_oper->he_oper_params); if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) -- cgit v1.2.3 From d3ca4ab4f16eb81dc3e7721251adcba49b229d54 Mon Sep 17 00:00:00 2001 From: Liam Kearney Date: Wed, 25 Oct 2023 11:27:55 +1100 Subject: wifi: ieee80211: fix PV1 frame control field name Update PV1 frame control field TODS to FROMDS to match 802.11 standard Signed-off-by: Liam Kearney Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20231025002755.1752983-1-liam.kearney@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 958771bac9c02..5e5ea216f3413 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -172,11 +172,11 @@ #define IEEE80211_SN_MODULO (IEEE80211_MAX_SN + 1) -/* PV1 Layout 11ah 9.8.3.1 */ +/* PV1 Layout IEEE 802.11-2020 9.8.3.1 */ #define IEEE80211_PV1_FCTL_VERS 0x0003 #define IEEE80211_PV1_FCTL_FTYPE 0x001c #define IEEE80211_PV1_FCTL_STYPE 0x00e0 -#define IEEE80211_PV1_FCTL_TODS 0x0100 +#define IEEE80211_PV1_FCTL_FROMDS 
0x0100 #define IEEE80211_PV1_FCTL_MOREFRAGS 0x0200 #define IEEE80211_PV1_FCTL_PM 0x0400 #define IEEE80211_PV1_FCTL_MOREDATA 0x0800 -- cgit v1.2.3 From d1f7728259ef02ac20b7afb6e7eb5a9eb1696c25 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 15 Nov 2023 17:49:59 +0100 Subject: gpiolib: provide gpio_device_get_label() Provide a getter for the GPIO device label string so that users don't have to dereference struct gpio_chip directly. Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 0aed62f0c6330..100c329dc986c 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -786,6 +786,7 @@ struct gpio_device *gpiod_to_gpio_device(struct gpio_desc *desc); /* struct gpio_device getters */ int gpio_device_get_base(struct gpio_device *gdev); +const char *gpio_device_get_label(struct gpio_device *gdev); #else /* CONFIG_GPIOLIB */ -- cgit v1.2.3 From a066f906ba396ab00d4af19fc5fad42b2605582a Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 22 Nov 2023 14:52:43 +0100 Subject: firmware_loader: Expand Firmware upload error codes with firmware invalid error No error code are available to signal an invalid firmware content. Drivers that can check the firmware content validity can not return this specific failure to the user-space Expand the firmware error code with an additional code: - "firmware invalid" code which can be used when the provided firmware is invalid Sync lib/test_firmware.c file accordingly. 
Acked-by: Luis Chamberlain Acked-by: Greg Kroah-Hartman Signed-off-by: Kory Maincent Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231122-feature_firmware_error_code-v3-1-04ec753afb71@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/firmware.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index de7fea3bca51e..0311858b46cef 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -27,6 +27,7 @@ struct firmware { * @FW_UPLOAD_ERR_INVALID_SIZE: invalid firmware image size * @FW_UPLOAD_ERR_RW_ERROR: read or write to HW failed, see kernel log * @FW_UPLOAD_ERR_WEAROUT: FLASH device is approaching wear-out, wait & retry + * @FW_UPLOAD_ERR_FW_INVALID: invalid firmware file * @FW_UPLOAD_ERR_MAX: Maximum error code marker */ enum fw_upload_err { @@ -38,6 +39,7 @@ enum fw_upload_err { FW_UPLOAD_ERR_INVALID_SIZE, FW_UPLOAD_ERR_RW_ERROR, FW_UPLOAD_ERR_WEAROUT, + FW_UPLOAD_ERR_FW_INVALID, FW_UPLOAD_ERR_MAX }; -- cgit v1.2.3 From b286f4e87e325b76789f30337c98ba72e00532e2 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 13 Nov 2023 10:07:52 +0200 Subject: serial: core: Move tty and serdev to be children of serial core port device Let's move tty and serdev controller to be children of the serial core port device. This way the runtime PM usage count of a child device propagates to the serial hardware device. The tty and serdev devices are associated with a specific serial port of a serial hardware controller device, and we now have serial core hierarchy of controllers and ports. The tty device moves happily with just a change of the parent device and update of device_find_child() handling. The serdev device init needs some changes to separate the serial hardware controller device from the parent device. 
With this change the tty devices move under sysfs similar to this x86_64 qemu example of a diff of "find /sys -name ttyS*": /sys/class/tty/ttyS0 /sys/class/tty/ttyS3 /sys/class/tty/ttyS1 -/sys/devices/pnp0/00:04/tty/ttyS0 -/sys/devices/platform/serial8250/tty/ttyS2 -/sys/devices/platform/serial8250/tty/ttyS3 -/sys/devices/platform/serial8250/tty/ttyS1 +/sys/devices/pnp0/00:04/00:04:0/00:04:0.0/tty/ttyS0 +/sys/devices/platform/serial8250/serial8250:0/serial8250:0.3/tty/ttyS3 +/sys/devices/platform/serial8250/serial8250:0/serial8250:0.1/tty/ttyS1 +/sys/devices/platform/serial8250/serial8250:0/serial8250:0.2/tty/ttyS2 If a serdev device is used instead of a tty, it moves in a similar way. Suggested-by: Johan Hovold Cc: Maximilian Luz Cc: Rob Herring Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20231113080758.30346-1-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 8 +++++++- include/linux/tty_port.h | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index f5f97fa25e8ad..0ebf53bb254fa 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -99,12 +99,14 @@ struct serdev_controller_ops { /** * struct serdev_controller - interface to the serdev controller * @dev: Driver model representation of the device. + * @host: Serial port hardware controller device * @nr: number identifier for this controller/bus. * @serdev: Pointer to slave device for this controller. * @ops: Controller operations. 
*/ struct serdev_controller { struct device dev; + struct device *host; unsigned int nr; struct serdev_device *serdev; const struct serdev_controller_ops *ops; @@ -167,7 +169,9 @@ struct serdev_device *serdev_device_alloc(struct serdev_controller *); int serdev_device_add(struct serdev_device *); void serdev_device_remove(struct serdev_device *); -struct serdev_controller *serdev_controller_alloc(struct device *, size_t); +struct serdev_controller *serdev_controller_alloc(struct device *host, + struct device *parent, + size_t size); int serdev_controller_add(struct serdev_controller *); void serdev_controller_remove(struct serdev_controller *); @@ -311,11 +315,13 @@ struct tty_driver; #ifdef CONFIG_SERIAL_DEV_CTRL_TTYPORT struct device *serdev_tty_port_register(struct tty_port *port, + struct device *host, struct device *parent, struct tty_driver *drv, int idx); int serdev_tty_port_unregister(struct tty_port *port); #else static inline struct device *serdev_tty_port_register(struct tty_port *port, + struct device *host, struct device *parent, struct tty_driver *drv, int idx) { diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 6b367eb17979a..3276311a7f384 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -149,10 +149,10 @@ struct device *tty_port_register_device_attr(struct tty_port *port, const struct attribute_group **attr_grp); struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, - struct device *device); + struct device *host, struct device *parent); struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, - struct device *device, void *drvdata, + struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp); void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index); -- cgit v1.2.3 From 
358779dd18c1e8531bd6d78c19ed802958d7c677 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 21 Nov 2023 10:22:44 +0100 Subject: tty: fix tty_operations types in documentation Commits 95713967ba52 ("tty: make tty_operations::write()'s count size_t") and dcaafbe6ee3b ("tty: propagate u8 data to tty_operations::put_char()") changed types of characters to u8, but omitted to fix the documentation. Fix the latter now. Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20231121092258.9334-4-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_driver.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 18beff0cec1ab..f428c1b784a20 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -72,8 +72,7 @@ struct serial_struct; * is closed for the last time freeing up the resources. This is * actually the second part of shutdown for routines that might sleep. * - * @write: ``ssize_t ()(struct tty_struct *tty, const unsigned char *buf, - * size_t count)`` + * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)`` * * This routine is called by the kernel to write a series (@count) of * characters (@buf) to the @tty device. The characters may come from @@ -85,7 +84,7 @@ struct serial_struct; * * Optional: Required for writable devices. May not sleep. * - * @put_char: ``int ()(struct tty_struct *tty, unsigned char ch)`` + * @put_char: ``int ()(struct tty_struct *tty, u8 ch)`` * * This routine is called by the kernel to write a single character @ch to * the @tty device. 
If the kernel uses this routine, it must call the -- cgit v1.2.3 From 239123e7e8ec4d35c8591c48f5de44925a88391d Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 21 Nov 2023 10:22:45 +0100 Subject: tty: move locking docs out of Returns for functions in tty.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both tty_kref_get() and tty_get_baud_rate() note about locking in their Return kernel-doc clause. Extract this info into a separate "Locking" paragraph -- the same as we do for other tty functions. Signed-off-by: "Jiri Slaby (SUSE)" Suggested-by: Ilpo Järvinen Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231121092258.9334-5-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 4b6340ac2af28..7625fc98fef3f 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -393,8 +393,10 @@ extern const struct class tty_class; * tty_kref_get - get a tty reference * @tty: tty device * - * Returns: a new reference to a tty object. The caller must hold sufficient - * locks/counts to ensure that their existing reference cannot go away + * Returns: a new reference to a tty object + * + * Locking: The caller must hold sufficient locks/counts to ensure that their + * existing reference cannot go away. */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { @@ -436,10 +438,10 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, * tty_get_baud_rate - get tty bit rates * @tty: tty to query * - * Returns: the baud rate as an integer for this terminal. The termios lock - * must be held by the caller and the terminal bit flags may be updated. 
+ * Returns: the baud rate as an integer for this terminal * - * Locking: none + * Locking: The termios lock must be held by the caller and the terminal bit + * flags may be updated. */ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) { -- cgit v1.2.3 From da549bdd15c295c24b2ee7ffe7ad0f3877fa8a87 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Nov 2023 02:00:39 -0500 Subject: dentry: switch the lists of children to hlist Saves a pointer per struct dentry and actually makes the things less clumsy. Cleaned the d_walk() and dcache_readdir() a bit by use of hlist_for_... iterators. A couple of new helpers - d_first_child() and d_next_sibling(), to make the expressions less awful. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/dcache.h | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 3da2f0545d5d7..0e397a0c519c5 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -68,12 +68,12 @@ extern const struct qstr dotdot_name; * large memory footprint increase). 
*/ #ifdef CONFIG_64BIT -# define DNAME_INLINE_LEN 32 /* 192 bytes */ +# define DNAME_INLINE_LEN 40 /* 192 bytes */ #else # ifdef CONFIG_SMP -# define DNAME_INLINE_LEN 36 /* 128 bytes */ -# else # define DNAME_INLINE_LEN 40 /* 128 bytes */ +# else +# define DNAME_INLINE_LEN 44 /* 128 bytes */ # endif #endif @@ -101,8 +101,8 @@ struct dentry { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; - struct list_head d_child; /* child of parent list */ - struct list_head d_subdirs; /* our children */ + struct hlist_node d_sib; /* child of parent list */ + struct hlist_head d_children; /* our children */ /* * d_alias and d_rcu can share memory */ @@ -600,4 +600,14 @@ struct name_snapshot { void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); +static inline struct dentry *d_first_child(const struct dentry *dentry) +{ + return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib); +} + +static inline struct dentry *d_next_sibling(const struct dentry *dentry) +{ + return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); +} + #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From f2824db1b49f947ba6e208ddf02edf4b1391480a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Nov 2023 16:42:43 -0500 Subject: kill d_instantiate_anon(), fold __d_instantiate_anon() into remaining caller now that the only user of d_instantiate_anon() is gone... 
[braino fix folded - kudos to Dan Carpenter] Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index fa0414cff85c1..8c5e3bdf11475 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -218,7 +218,6 @@ extern seqlock_t rename_lock; */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); -extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); -- cgit v1.2.3 From 8a54b38f3e5ced6cc4b246b8e54bd0f50deceaa8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Nov 2023 16:01:27 -0500 Subject: d_genocide(): move the extern into fs/internal.h Signed-off-by: Al Viro --- include/linux/dcache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8c5e3bdf11475..b4324d47f249a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -243,9 +243,6 @@ extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); -/* - the ramfs-type tree */ -extern void d_genocide(struct dentry *); - extern void d_mark_tmpfile(struct file *, struct inode *); extern void d_tmpfile(struct file *, struct inode *); -- cgit v1.2.3 From 57851607326a2beef21e67f83f4f53a90df8445a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Nov 2023 21:38:48 -0500 Subject: get rid of DCACHE_GENOCIDE ... 
now that we never call d_genocide() other than from kill_litter_super() Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b4324d47f249a..981f529c6cb55 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -173,7 +173,6 @@ struct dentry_operations { #define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT BIT(8) -#define DCACHE_GENOCIDE BIT(9) #define DCACHE_SHRINK_LIST BIT(10) #define DCACHE_OP_WEAK_REVALIDATE BIT(11) -- cgit v1.2.3 From 1b327b5ac57cf83e3d015de45d0142852f475375 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 10 Nov 2023 14:07:43 -0500 Subject: kill DCACHE_MAY_FREE With the new ordering in __dentry_kill() it has become redundant - it's set if and only if both DCACHE_DENTRY_KILLED and DCACHE_SHRINK_LIST are set. We set it in __dentry_kill(), after having set DCACHE_DENTRY_KILLED with the only condition being that DCACHE_SHRINK_LIST is there; all of that is done without dropping ->d_lock and the only place that checks that flag (shrink_dentry_list()) does so under ->d_lock, after having found the victim on its shrink list. Since DCACHE_SHRINK_LIST is set only when placing dentry into shrink list and removed only by shrink_dentry_list() itself, a check for DCACHE_DENTRY_KILLED in there would be equivalent to check for DCACHE_MAY_FREE. 
Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b4449a1a47ffe..48b393545ec20 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -202,7 +202,6 @@ struct dentry_operations { #define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ #define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ -#define DCACHE_MAY_FREE BIT(23) #define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ #define DCACHE_OP_REAL BIT(26) -- cgit v1.2.3 From 1b6ae9f6e6c3e3c35aad0f11b116a81780b8aa03 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Mon, 6 Nov 2023 14:44:17 +0100 Subject: dcache: remove unnecessary NULL check in dget_dlock() dget_dlock() requires dentry->d_lock to be held when called, yet contains a NULL check for dentry. An audit of all calls to dget_dlock() shows that it is never called with a NULL pointer (as spin_lock()/spin_unlock() would crash in these cases): $ git grep -W '\' arch/powerpc/platforms/cell/spufs/inode.c- spin_lock(&dentry->d_lock); arch/powerpc/platforms/cell/spufs/inode.c- if (simple_positive(dentry)) { arch/powerpc/platforms/cell/spufs/inode.c: dget_dlock(dentry); fs/autofs/expire.c- spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); fs/autofs/expire.c- if (simple_positive(child)) { fs/autofs/expire.c: dget_dlock(child); fs/autofs/root.c: dget_dlock(active); fs/autofs/root.c- spin_unlock(&active->d_lock); fs/autofs/root.c: dget_dlock(expiring); fs/autofs/root.c- spin_unlock(&expiring->d_lock); fs/ceph/dir.c- if (!spin_trylock(&dentry->d_lock)) fs/ceph/dir.c- continue; [...] fs/ceph/dir.c: dget_dlock(dentry); fs/ceph/mds_client.c- spin_lock(&alias->d_lock); [...] 
fs/ceph/mds_client.c: dn = dget_dlock(alias); fs/configfs/inode.c- spin_lock(&dentry->d_lock); fs/configfs/inode.c- if (simple_positive(dentry)) { fs/configfs/inode.c: dget_dlock(dentry); fs/libfs.c: found = dget_dlock(d); fs/libfs.c- spin_unlock(&d->d_lock); fs/libfs.c: found = dget_dlock(child); fs/libfs.c- spin_unlock(&child->d_lock); fs/libfs.c: child = dget_dlock(d); fs/libfs.c- spin_unlock(&d->d_lock); fs/ocfs2/dcache.c: dget_dlock(dentry); fs/ocfs2/dcache.c- spin_unlock(&dentry->d_lock); include/linux/dcache.h:static inline struct dentry *dget_dlock(struct dentry *dentry) After taking out the NULL check, dget_dlock() becomes almost identical to __dget_dlock(); the only difference is that dget_dlock() returns the dentry that was passed in. These are static inline helpers, so we can rely on the compiler to discard unused return values. We can therefore also remove __dget_dlock() and replace calls to it by dget_dlock(). Also fix up and improve the kerneldoc comments while we're at it. Al Viro pointed out that we can also clean up some of the callers to make use of the returned value and provided a bit more info for the kerneldoc. While preparing v2 I also noticed that the tabs used in the kerneldoc comments were causing the kerneldoc to get parsed incorrectly so I also fixed this up (including for d_unhashed, which is otherwise unrelated). Testing: x86 defconfig build + boot; make htmldocs for the kerneldoc warning. objdump shows there are code generation changes. 
Link: https://lore.kernel.org/all/20231022164520.915013-1-vegard.nossum@oracle.com/ Cc: Alexander Viro Cc: Christian Brauner Cc: linux-fsdevel@vger.kernel.org Cc: Nick Piggin Cc: Waiman Long Cc: linux-doc@vger.kernel.org Signed-off-by: Vegard Nossum Signed-off-by: Al Viro --- include/linux/dcache.h | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 48b393545ec20..1666c387861f7 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -287,20 +287,40 @@ extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** - * dget, dget_dlock - get a reference to a dentry - * @dentry: dentry to get a reference to + * dget_dlock - get a reference to a dentry + * @dentry: dentry to get a reference to * - * Given a dentry or %NULL pointer increment the reference count - * if appropriate and return the dentry. A dentry will not be - * destroyed when it has references. + * Given a live dentry, increment the reference count and return the dentry. + * Caller must hold @dentry->d_lock. Making sure that dentry is alive is + * caller's resonsibility. There are many conditions sufficient to guarantee + * that; e.g. anything with non-negative refcount is alive, so's anything + * hashed, anything positive, anyone's parent, etc. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { - if (dentry) - dentry->d_lockref.count++; + dentry->d_lockref.count++; return dentry; } + +/** + * dget - get a reference to a dentry + * @dentry: dentry to get a reference to + * + * Given a dentry or %NULL pointer increment the reference count + * if appropriate and return the dentry. A dentry will not be + * destroyed when it has references. Conversely, a dentry with + * no references can disappear for any number of reasons, starting + * with memory pressure. 
In other words, that primitive is + * used to clone an existing reference; using it on something with + * zero refcount is a bug. + * + * NOTE: it will spin if @dentry->d_lock is held. From the deadlock + * avoidance point of view it is equivalent to spin_lock()/increment + * refcount/spin_unlock(), so calling it under @dentry->d_lock is + * always a bug; so's calling it under ->d_lock on any of its descendents. + * + */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) @@ -311,12 +331,11 @@ static inline struct dentry *dget(struct dentry *dentry) extern struct dentry *dget_parent(struct dentry *dentry); /** - * d_unhashed - is dentry hashed - * @dentry: entry to check + * d_unhashed - is dentry hashed + * @dentry: entry to check * - * Returns true if the dentry passed is not currently hashed. + * Returns true if the dentry passed is not currently hashed. */ - static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); -- cgit v1.2.3 From 18caaedaf4c3712ab6821f292598a8f86e6d7972 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 26 Nov 2023 09:56:29 +0100 Subject: locking/lockdep: Slightly reorder 'struct lock_class' to save some memory Based on pahole, 2 holes can be combined in the 'struct lock_class'. This saves 8 bytes in the structure on my x86_64. 
On a x86_64 configured with allmodconfig, this saves ~64kb of memory in 'kernel/locking/lockdep.o': text data bss dec filename Before: 102,501 1,912,490 11,531,636 13,546,627 kernel/locking/lockdep.o After: 102,181 1,912,490 11,466,100 13,480,771 kernel/locking/lockdep.o because of: struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; After the reorder, pahole gives: struct lock_class { struct hlist_node hash_entry; /* 0 16 */ struct list_head lock_entry; /* 16 16 */ struct list_head locks_after; /* 32 16 */ struct list_head locks_before; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ const struct lockdep_subclass_key * key; /* 64 8 */ lock_cmp_fn cmp_fn; /* 72 8 */ lock_print_fn print_fn; /* 80 8 */ unsigned int subclass; /* 88 4 */ unsigned int dep_gen_id; /* 92 4 */ long unsigned int usage_mask; /* 96 8 */ const struct lock_trace * usage_traces[10]; /* 104 80 */ /* --- cacheline 2 boundary (128 bytes) was 56 bytes ago --- */ const char * name; /* 184 8 */ /* --- cacheline 3 boundary (192 bytes) --- */ int name_version; /* 192 4 */ u8 wait_type_inner; /* 196 1 */ u8 wait_type_outer; /* 197 1 */ u8 lock_type; /* 198 1 */ /* XXX 1 byte hole, try to pack */ long unsigned int contention_point[4]; /* 200 32 */ long unsigned int contending_point[4]; /* 232 32 */ /* size: 264, cachelines: 5, members: 18 */ /* sum members: 263, holes: 1, sum holes: 1 */ /* last cacheline: 8 bytes */ }; Signed-off-by: Christophe JAILLET Signed-off-by: Ingo Molnar Acked-by: Waiman Long Link: https://lore.kernel.org/r/801258371fc4101f96495a5aaecef638d6cbd8d3.1700988869.git.christophe.jaillet@wanadoo.fr --- include/linux/lockdep_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 2ebc323d345ae..857d785e89e6a 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -127,12 +127,12 @@ struct lock_class { unsigned long usage_mask; const struct 
lock_trace *usage_traces[LOCK_TRACE_STATES]; + const char *name; /* * Generation counter, when doing certain classes of graph walking, * to ensure that we check one node only once: */ int name_version; - const char *name; u8 wait_type_inner; u8 wait_type_outer; -- cgit v1.2.3 From a2e7e59a94269484a83386972ca07c22fd188854 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 15 Nov 2023 18:25:44 +0000 Subject: iommu: Avoid more races around device probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out there are more subtle races beyond just the main part of __iommu_probe_device() itself running in parallel - the dev_iommu_free() on the way out of an unsuccessful probe can still manage to trip up concurrent accesses to a device's fwspec. Thus, extend the scope of iommu_probe_device_lock() to also serialise fwspec creation and initial retrieval. Reported-by: Zhenhua Huang Link: https://lore.kernel.org/linux-iommu/e2e20e1c-6450-4ac5-9804-b0000acdf7de@quicinc.com/ Fixes: 01657bc14a39 ("iommu: Avoid races around device probe") Signed-off-by: Robin Murphy Acked-by: Greg Kroah-Hartman Reviewed-by: André Draszik Tested-by: André Draszik Link: https://lore.kernel.org/r/16f433658661d7cadfea51e7c65da95826112a2b.1700071477.git.robin.murphy@arm.com Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ec289c1016f5f..6291aa7b079b0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -845,6 +845,7 @@ static inline void dev_iommu_priv_set(struct device *dev, void *priv) dev->iommu->priv = priv; } +extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features f); -- cgit v1.2.3 From a9c362db39207c4934c9125e56ed730c5297c37c Mon Sep 17 00:00:00 2001 From: Robin 
Murphy Date: Tue, 21 Nov 2023 18:03:59 +0000 Subject: iommu: Validate that devices match domains Before we can allow drivers to coexist, we need to make sure that one driver's domain ops can't misinterpret another driver's dev_iommu_priv data. To that end, add a token to the domain so we can remember how it was allocated - for now this may as well be the device ops, since they still correlate 1:1 with drivers. We can trust ourselves for internal default domain attachment, so add checks to cover all the public attach interfaces. Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/097c6f30480e4efe12195d00ba0e84ea4837fb4c.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ec289c1016f5f..077bf8cae2f7a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -106,7 +106,7 @@ struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; - + const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; -- cgit v1.2.3 From 17de3f5fdd35676b0e3d41c7c9bf4e3032eb3673 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 21 Nov 2023 18:04:02 +0000 Subject: iommu: Retire bus ops With the rest of the API internals converted, it's time to finally tackle probe_device and how we bootstrap the per-device ops association to begin with. This ends up being disappointingly straightforward, since fwspec users are already doing it in order to find their of_xlate callback, and it works out that we can easily do the equivalent for other drivers too. 
Then shuffle the remaining awareness of iommu_ops into the couple of core headers that still need it, and breathe a sigh of relief. Ding dong the bus ops are gone! CC: Rafael J. Wysocki Acked-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Jerry Snitselaar Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/a59011ef65b4b6657cb0b7a388d786b779b61305.1700589539.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel --- include/linux/device.h | 1 - include/linux/device/bus.h | 5 ----- include/linux/dma-map-ops.h | 1 + 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index d7a72a8749ea0..0314dbbdb5345 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -42,7 +42,6 @@ struct class; struct subsys_private; struct device_node; struct fwnode_handle; -struct iommu_ops; struct iommu_group; struct dev_pin_info; struct dev_iommu; diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index ae10c43227543..e25aab08f873d 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -62,9 +62,6 @@ struct fwnode_handle; * this bus. * @pm: Power management operations of this bus, callback the specific * device driver's pm-ops. - * @iommu_ops: IOMMU specific operations for this bus, used to attach IOMMU - * driver implementations to a bus and allow the driver to do - * bus-specific setup * @need_parent_lock: When probing or removing a device on this bus, the * device core should lock the device's parent. 
* @@ -104,8 +101,6 @@ struct bus_type { const struct dev_pm_ops *pm; - const struct iommu_ops *iommu_ops; - bool need_parent_lock; }; diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index f2fc203fb8a1a..a52e508d1869f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -11,6 +11,7 @@ #include struct cma; +struct iommu_ops; /* * Values for struct dma_map_ops.flags: -- cgit v1.2.3 From 17b226dcf80ce79d02f4f0b08813d8848885b986 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 24 Nov 2023 15:24:33 +0100 Subject: iommu: Allow passing custom allocators to pgtable drivers This will be useful for GPU drivers who want to keep page tables in a pool so they can: - keep freed page tables in a free pool and speed-up upcoming page table allocations - batch page table allocation instead of allocating one page at a time - pre-reserve pages for page tables needed for map/unmap operations, to ensure map/unmap operations don't try to allocate memory in paths they're not allowed to block or fail It might also be valuable for other aspects of GPU and similar use-cases, like fine-grained memory accounting and resource limiting. We will extend the Arm LPAE format to support custom allocators in a separate commit. Signed-off-by: Boris Brezillon Reviewed-by: Steven Price Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/20231124142434.1577550-2-boris.brezillon@collabora.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 25142a0e2fc2c..86cf1f7ae389a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -100,6 +100,30 @@ struct io_pgtable_cfg { const struct iommu_flush_ops *tlb; struct device *iommu_dev; + /** + * @alloc: Custom page allocator. + * + * Optional hook used to allocate page tables. 
If this function is NULL, + * @free must be NULL too. + * + * Memory returned should be zeroed and suitable for dma_map_single() and + * virt_to_phys(). + * + * Not all formats support custom page allocators. Before considering + * passing a non-NULL value, make sure the chosen page format supports + * this feature. + */ + void *(*alloc)(void *cookie, size_t size, gfp_t gfp); + + /** + * @free: Custom page de-allocator. + * + * Optional hook used to free page tables allocated with the @alloc + * hook. Must be non-NULL if @alloc is not NULL, must be NULL + * otherwise. + */ + void (*free)(void *cookie, void *pages, size_t size); + /* Low-level data specific to the table format */ union { struct { @@ -241,16 +265,26 @@ io_pgtable_tlb_add_page(struct io_pgtable *iop, iop->cfg.tlb->tlb_add_page(gather, iova, granule, iop->cookie); } +/** + * enum io_pgtable_caps - IO page table backend capabilities. + */ +enum io_pgtable_caps { + /** @IO_PGTABLE_CAP_CUSTOM_ALLOCATOR: Backend accepts custom page table allocators. */ + IO_PGTABLE_CAP_CUSTOM_ALLOCATOR = BIT(0), +}; + /** * struct io_pgtable_init_fns - Alloc/free a set of page tables for a * particular format. * * @alloc: Allocate a set of page tables described by cfg. * @free: Free the page tables associated with iop. + * @caps: Combination of @io_pgtable_caps flags encoding the backend capabilities. */ struct io_pgtable_init_fns { struct io_pgtable *(*alloc)(struct io_pgtable_cfg *cfg, void *cookie); void (*free)(struct io_pgtable *iop); + u32 caps; }; extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns; -- cgit v1.2.3 From 8c88a474357ead632b07c70bf7f119ace8c3b39e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Nov 2023 17:25:26 +0100 Subject: debugfs: add API to allow debugfs operations cancellation In some cases there might be longer-running hardware accesses in debugfs files, or attempts to acquire locks, and we want to still be able to quickly remove the files. 
Introduce a cancellations API to use inside the debugfs handler functions to be able to cancel such operations on a per-file basis. Acked-by: Greg Kroah-Hartman Signed-off-by: Johannes Berg --- include/linux/debugfs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index ea2d919fd9c79..c9c65b132c0fd 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -171,6 +171,25 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, size_t count, loff_t *ppos); +/** + * struct debugfs_cancellation - cancellation data + * @list: internal, for keeping track + * @cancel: callback to call + * @cancel_data: extra data for the callback to call + */ +struct debugfs_cancellation { + struct list_head list; + void (*cancel)(struct dentry *, void *); + void *cancel_data; +}; + +void __acquires(cancellation) +debugfs_enter_cancellation(struct file *file, + struct debugfs_cancellation *cancellation); +void __releases(cancellation) +debugfs_leave_cancellation(struct file *file, + struct debugfs_cancellation *cancellation); + #else #include -- cgit v1.2.3 From 668bfeeabb5e402e3b36992f7859c284cc6e594d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Nov 2023 08:20:02 +0100 Subject: block: move a few definitions out of CONFIG_BLK_DEV_ZONED Allow using a few symbols with IS_ENABLED instead of #ifdef by moving the declarations out of #ifdef CONFIG_BLK_DEV_ZONED, and move bdev_nr_zones into the remaining #ifdef CONFIG_BLK_DEV_ZONED, #else block below. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231127072002.1332685-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 51fa7ffdee83b..17c0a7d0d319e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -331,22 +331,13 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model); -#ifdef CONFIG_BLK_DEV_ZONED #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, - unsigned int nr_zones, report_zones_cb cb, void *data); -unsigned int bdev_nr_zones(struct block_device *bdev); -extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, - gfp_t gfp_mask); + unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, + sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)); -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return 0; -} -#endif /* CONFIG_BLK_DEV_ZONED */ + void (*update_driver_data)(struct gendisk *disk)); /* * Independent access ranges: struct blk_independent_access_range describes @@ -643,6 +634,8 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } #ifdef CONFIG_BLK_DEV_ZONED +unsigned int bdev_nr_zones(struct block_device *bdev); + static inline unsigned int disk_nr_zones(struct gendisk *disk) { return blk_queue_is_zoned(disk->queue) ? 
disk->nr_zones : 0; @@ -687,6 +680,11 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) } #else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + return 0; +} + static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; -- cgit v1.2.3 From 95ba893c9f4feb836ddce627efd0bb6af6667031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Tue, 14 Nov 2023 13:37:09 +0100 Subject: dma-buf: fix check in dma_resv_add_fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's valid to add the same fence multiple times to a dma-resv object and we shouldn't need one extra slot for each. Signed-off-by: Christian König Reviewed-by: Thomas Hellström Fixes: a3f7c10a269d5 ("dma-buf/dma-resv: check if the new fence is really later") Cc: stable@vger.kernel.org # v5.19+ Link: https://patchwork.freedesktop.org/patch/msgid/20231115093035.1889-1-christian.koenig@amd.com --- include/linux/dma-fence.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index ebe78bd3d121d..b3772edca2e6e 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -498,6 +498,21 @@ static inline bool dma_fence_is_later(struct dma_fence *f1, return __dma_fence_is_later(f1->seqno, f2->seqno, f1->ops); } +/** + * dma_fence_is_later_or_same - return true if f1 is later or same as f2 + * @f1: the first fence from the same context + * @f2: the second fence from the same context + * + * Returns true if f1 is chronologically later than f2 or the same fence. Both + * fences must be from the same context, since a seqno is not re-used across + * contexts. 
+ */ +static inline bool dma_fence_is_later_or_same(struct dma_fence *f1, + struct dma_fence *f2) +{ + return f1 == f2 || dma_fence_is_later(f1, f2); +} + /** * dma_fence_later - return the chronologically later fence * @f1: the first fence from the same context -- cgit v1.2.3 From 243ad8df7a1bd24c2e01bd99d9f0bb88844dae91 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 24 Nov 2023 12:27:52 +0000 Subject: net: phy: add possible interfaces Add a possible_interfaces member to struct phy_device to indicate which interfaces a clause 45 PHY may switch between depending on the media. This must be populated by the PHY driver by the time the .config_init() method completes according to the PHYs host-side configuration. For example, the Marvell 88x3310 PHY can switch between 10GBASE-R, 5GBASE-R, 2500BASE-X, and SGMII on the host side depending on the media side speed, so all these interface modes are set in the possible_interfaces member. This allows phylib users (such as phylink) to know in advance which interface modes to expect, which allows them to appropriately restrict the advertised link modes according to the capabilities of other parts of the link. Tested-by: Luo Jie Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/E1r6VHk-00DDLN-I7@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e5f1f41e399c7..6e7ebcc50b859 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -605,6 +605,8 @@ struct macsec_ops; * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume * @interface: enum phy_interface_t value + * @possible_interfaces: bitmap if interface modes that the attached PHY + * will switch between depending on media speed. 
* @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics @@ -674,6 +676,7 @@ struct phy_device { u32 dev_flags; phy_interface_t interface; + DECLARE_PHY_INTERFACE_MASK(possible_interfaces); /* * forced speed & duplex (no autoneg) -- cgit v1.2.3 From 073d3d2ca7d462afc8159ca0175675b9b7b4f162 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Oct 2023 12:40:04 +0530 Subject: OPP: Level zero is valid The level zero can be used by some OPPs to drop performance state vote for the device. It is perfectly fine to allow the same. _set_opp_level() considers it as an invalid value currently and returns early. In order to support this properly, initialize the level field with U32_MAX, which denotes unused level field. Reported-by: Stephan Gerhold Reviewed-by: Ulf Hansson Tested-by: Stephan Gerhold Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index ccd97bcef2694..af53101a13839 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -92,9 +92,12 @@ struct dev_pm_opp_config { struct device ***virt_devs; }; +#define OPP_LEVEL_UNSET U32_MAX + /** * struct dev_pm_opp_data - The data to use to initialize an OPP. - * @level: The performance level for the OPP. + * @level: The performance level for the OPP. Set level to OPP_LEVEL_UNSET if + * level field isn't used. * @freq: The clock rate in Hz for the OPP. * @u_volt: The voltage in uV for the OPP. */ -- cgit v1.2.3 From e37440e7e2c2760475d60c5556b59c8880a7fd63 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Oct 2023 14:17:48 +0530 Subject: OPP: Call dev_pm_opp_set_opp() for required OPPs Configuring the required OPP was never properly implemented, we just took an exception for genpds and configured them directly, while leaving out all other required OPP types. 
Now that a standard call to dev_pm_opp_set_opp() takes care of configuring the opp->level too, the special handling for genpds can be avoided by simply calling dev_pm_opp_set_opp() for the required OPPs, which shall eventually configure the corresponding level for genpds. This also makes it possible for us to configure other types of required OPPs (no concrete users yet though), via the same path. This is how other frameworks take care of parent nodes, like clock, regulators, etc, where we recursively call the same helper. In order to call dev_pm_opp_set_opp() for the virtual genpd devices, they must share the OPP table of the genpd. Call _add_opp_dev() for them to get that done. This commit also extends the struct dev_pm_opp_config to pass required devices, for non-genpd cases, which can be used to call dev_pm_opp_set_opp() for the non-genpd required devices. Reviewed-by: Ulf Hansson Tested-by: Stephan Gerhold Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index af53101a13839..81dff7facdc98 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -74,8 +74,10 @@ typedef int (*config_clks_t)(struct device *dev, struct opp_table *opp_table, * @supported_hw_count: Number of elements in the array. * @regulator_names: Array of pointers to the names of the regulator, NULL terminated. * @genpd_names: Null terminated array of pointers containing names of genpd to - * attach. - * @virt_devs: Pointer to return the array of virtual devices. + * attach. Mutually exclusive with required_devs. + * @virt_devs: Pointer to return the array of genpd virtual devices. Mutually + * exclusive with required_devs. + * @required_devs: Required OPP devices. Mutually exclusive with genpd_names/virt_devs. * * This structure contains platform specific OPP configurations for the device. 
*/ @@ -90,6 +92,7 @@ struct dev_pm_opp_config { const char * const *regulator_names; const char * const *genpd_names; struct device ***virt_devs; + struct device **required_devs; }; #define OPP_LEVEL_UNSET U32_MAX -- cgit v1.2.3 From 3652117f854819a148ff0fbe4492587d3520b5e5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:23 +0100 Subject: eventfd: simplify eventfd_signal() Ever since the eventfd type was introduced back in 2007 in commit e1ad7468c77d ("signal/timer/event: eventfd core") the eventfd_signal() function only ever passed 1 as a value for @n. There's no point in keeping that additional argument. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-2-bd549b14ce0c@kernel.org Acked-by: Xu Yilun Acked-by: Andrew Donnellan # ocxl Acked-by: Eric Farman # s390 Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index b9d83652c097a..5620894315514 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,7 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); +__u64 eventfd_signal(struct eventfd_ctx *ctx); __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); @@ -58,7 +58,7 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) +static inline int eventfd_signal(struct eventfd_ctx *ctx) { return -ENOSYS; } -- cgit v1.2.3 From 120ae58593630819209a011a3f9c89f73bcc9894 Mon Sep 17 00:00:00 2001 
From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:24 +0100 Subject: eventfd: simplify eventfd_signal_mask() The eventfd_signal_mask() helper was introduced for io_uring and similar to eventfd_signal() it always passed 1 for @n. So don't bother with that argument at all. Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-3-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 5620894315514..971943ecb2a63 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -36,7 +36,7 @@ struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask); +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -63,8 +63,7 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx) return -ENOSYS; } -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, - unsigned mask) +static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { return -ENOSYS; } -- cgit v1.2.3 From b7638ad0c7802ea854599ce753d0e6d20690f7e2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:48:25 +0100 Subject: eventfd: make eventfd_signal{_mask}() void No caller cares about the return value. 
Link: https://lore.kernel.org/r/20231122-vfs-eventfd-signal-v2-4-bd549b14ce0c@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/eventfd.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 971943ecb2a63..e32bee4345fb9 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -__u64 eventfd_signal(struct eventfd_ctx *ctx); -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); @@ -58,14 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd) return ERR_PTR(-ENOSYS); } -static inline int eventfd_signal(struct eventfd_ctx *ctx) +static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) { - return -ENOSYS; -} - -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask) -{ - return -ENOSYS; } static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) @@ -91,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) #endif +static inline void eventfd_signal(struct eventfd_ctx *ctx) +{ + eventfd_signal_mask(ctx, 0); +} + #endif /* _LINUX_EVENTFD_H */ -- cgit v1.2.3 From e65a29f0235a438ece414d2d99bbf0d31aa97d04 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:37 +0100 Subject: mnt_idmapping: remove check_fsmapping() The helper is a bit pointless. Just open-code the check. 
Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-1-dae4abdde5bd@kernel.org Signed-off-by: Christian Brauner --- include/linux/mnt_idmapping.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index b8da2db4ecd29..cd4d5c8781f54 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -244,7 +244,4 @@ static inline kgid_t mapped_fsgid(struct mnt_idmap *idmap, return from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(current_fsgid())); } -bool check_fsmapping(const struct mnt_idmap *idmap, - const struct super_block *sb); - #endif /* _LINUX_MNT_IDMAPPING_H */ -- cgit v1.2.3 From 783822e44594639848b78d4bb61dde26fba04e05 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 22 Nov 2023 13:44:39 +0100 Subject: mnt_idmapping: decouple from namespaces There's no reason we need to couple mnt idmapping to namespaces in the way we currently do. Copy the idmapping when an idmapped mount is created and don't take any reference on the namespace at all. We also can't easily refcount struct uid_gid_map because it needs to stay the size of a cacheline otherwise we risk performance regressions (Ignoring for a second that right now struct uid_gid_map isn't actually 64 byte but 72 but that's a fix for another patch series.). 
Link: https://lore.kernel.org/r/20231122-vfs-mnt_idmap-v1-3-dae4abdde5bd@kernel.org Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/uidgid.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb0..415a7ca2b8829 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -17,6 +17,7 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct uid_gid_map; typedef struct { uid_t val; @@ -138,6 +139,9 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return from_kgid(ns, gid) != (gid_t) -1; } +u32 map_id_down(struct uid_gid_map *map, u32 id); +u32 map_id_up(struct uid_gid_map *map, u32 id); + #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) @@ -186,6 +190,15 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) return gid_valid(gid); } +static inline u32 map_id_down(struct uid_gid_map *map, u32 id) +{ + return id; +} + +static inline u32 map_id_up(struct uid_gid_map *map, u32 id) +{ + return id; +} #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */ -- cgit v1.2.3 From e0894ff038d86f30614ec16ec26dacb88c8d2bd4 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Mon, 27 Nov 2023 12:05:21 +1300 Subject: platform/x86: asus-wmi: disable USB0 hub on ROG Ally before suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS have worked around an issue in XInput where it doesn't support USB selective suspend, which causes suspend issues in Windows. They worked around this by adjusting the MCU firmware to disable the USB0 hub when the screen is switched off during the Microsoft DSM suspend path in ACPI. 
The issue we have with this however is one of timing - the call that tells the MCU to do this isn't able to complete before suspend is done so we call this in a prepare() and add a small msleep() to ensure it is done. This must be done before the screen is switched off to prevent a variety of possible races. Further to this the MCU powersave option must also be disabled as it can cause a number of issues such as: - unreliable resume connection of N-Key - complete loss of N-Key if the power is plugged in while suspended Disabling the powersave option prevents this. Without this the MCU is unable to initialise itself correctly on resume. Signed-off-by: "Luke D. Jones" Tested-by: Philip Mueller Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20231126230521.125708-2-luke@ljones.dev Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 63e630276499f..ab1c7deff118f 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -114,6 +114,9 @@ /* Charging mode - 1=Barrel, 2=USB */ #define ASUS_WMI_DEVID_CHARGE_MODE 0x0012006C +/* MCU powersave mode */ +#define ASUS_WMI_DEVID_MCU_POWERSAVE 0x001200E2 + /* epu is connected? 1 == true */ #define ASUS_WMI_DEVID_EGPU_CONNECTED 0x00090018 /* egpu on/off */ -- cgit v1.2.3 From 083772c9f972dcc248913b52a0dec1025baa1e16 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:30 -0800 Subject: net: page_pool: record pools per netdev Link the page pools with netdevs. This needs to be netns compatible so we have two options. Either we record the pools per netns and have to worry about moving them as the netdev gets moved. Or we record them directly on the netdev so they move with the netdev without any extra work. Implement the latter option. 
Since pools may outlast netdev we need a place to store orphans. In time honored tradition use loopback for this purpose. Reviewed-by: Mina Almasry Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/list.h | 20 ++++++++++++++++++++ include/linux/netdevice.h | 4 ++++ include/linux/poison.h | 2 ++ 3 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 1837caedf7231..059aa1fff41e9 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1119,6 +1119,26 @@ static inline void hlist_move_list(struct hlist_head *old, old->first = NULL; } +/** + * hlist_splice_init() - move all entries from one list to another + * @from: hlist_head from which entries will be moved + * @last: last entry on the @from list + * @to: hlist_head to which entries will be moved + * + * @to can be empty, @from must contain at least @last. + */ +static inline void hlist_splice_init(struct hlist_head *from, + struct hlist_node *last, + struct hlist_head *to) +{ + if (to->first) + to->first->pprev = &last->next; + last->next = to->first; + to->first = from->first; + from->first->pprev = &to->first; + from->first = NULL; +} + #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87caa81f70ca..998c7aaa98b86 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2447,6 +2447,10 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif +#if IS_ENABLED(CONFIG_PAGE_POOL) + /** @page_pools: page pools created for this netdevice */ + struct hlist_head page_pools; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/poison.h b/include/linux/poison.h index 851a855d38688..27a7dad17eefb 100644 --- a/include/linux/poison.h +++ 
b/include/linux/poison.h @@ -83,6 +83,8 @@ /********** net/core/skbuff.c **********/ #define SKB_LIST_POISON_NEXT ((void *)(0x800 + POISON_POINTER_DELTA)) +/********** net/ **********/ +#define NET_PTR_POISON ((void *)(0x801 + POISON_POINTER_DELTA)) /********** kernel/bpf/ **********/ #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) -- cgit v1.2.3 From c392cbecd8eca4c53f2bf508731257d9d0a21c2d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 27 Nov 2023 16:47:04 -0700 Subject: io_uring/kbuf: defer release of mapped buffer rings If a provided buffer ring is setup with IOU_PBUF_RING_MMAP, then the kernel allocates the memory for it and the application is expected to mmap(2) this memory. However, io_uring uses remap_pfn_range() for this operation, so we cannot rely on normal munmap/release on freeing them for us. Stash an io_buf_free entry away for each of these, if any, and provide a helper to free them post ->release(). Cc: stable@vger.kernel.org Fixes: c56e022c0a27 ("io_uring: add support for user mapped provided buffer ring") Reported-by: Jann Horn Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d3009d56af0ba..805bb635cdf55 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -340,6 +340,9 @@ struct io_ring_ctx { struct list_head io_buffers_cache; + /* deferred free list, protected by ->uring_lock */ + struct hlist_head io_buf_list; + /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; -- cgit v1.2.3 From 9a5f580c1c71b6aedba696c4898a7a7184cef8ad Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 2 Nov 2023 11:42:24 +0000 Subject: EDAC/mc: Add support for HBM3 memory type AMD MI300A models use HBM3 (High Bandwidth Memory Gen 3) memory. 
HBM is a high-speed computer memory interface for 3D-stacked synchronous dynamic random-access memory (SDRAM). Signed-off-by: Muralidhara M K Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20231102114225.2006878-4-muralimk@amd.com --- include/linux/edac.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index fa4bda2a70f6c..1174beb94ab6d 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type) * @MEM_NVDIMM: Non-volatile RAM * @MEM_WIO2: Wide I/O 2. * @MEM_HBM2: High bandwidth Memory Gen 2. + * @MEM_HBM3: High bandwidth Memory Gen 3. */ enum mem_type { MEM_EMPTY = 0, @@ -218,6 +219,7 @@ enum mem_type { MEM_NVDIMM, MEM_WIO2, MEM_HBM2, + MEM_HBM3, }; #define MEM_FLAG_EMPTY BIT(MEM_EMPTY) @@ -248,6 +250,7 @@ enum mem_type { #define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM) #define MEM_FLAG_WIO2 BIT(MEM_WIO2) #define MEM_FLAG_HBM2 BIT(MEM_HBM2) +#define MEM_FLAG_HBM3 BIT(MEM_HBM3) /** * enum edac_type - Error Detection and Correction capabilities and mode -- cgit v1.2.3 From fad907cffd4bde7384812cf32fcf69becab805cc Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 28 Nov 2023 20:30:26 +0800 Subject: block: move .bd_inode into 1st cacheline of block_device The .bd_inode field of block_device is used in IO fast path of blkdev_write_iter() and blkdev_llseek(), so it is more efficient to keep it into the 1st cacheline. .bd_openers is only touched in open()/close(), and .bd_size_lock is only for updating bdev capacity, which is in slow path too. So swap .bd_inode layout with .bd_openers & .bd_size_lock to move .bd_inode into the 1st cache line. 
Cc: Yu Kuai Signed-off-by: Ming Lei Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231128123027.971610-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d5c5e59ddbd25..f7d40692dd948 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -49,9 +49,10 @@ struct block_device { bool bd_write_holder; bool bd_has_submit_bio; dev_t bd_dev; + struct inode *bd_inode; /* will die */ + atomic_t bd_openers; spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ - struct inode * bd_inode; /* will die */ void * bd_claiming; void * bd_holder; const struct blk_holder_ops *bd_holder_ops; -- cgit v1.2.3 From 67d995e069535c32829f5d368d919063492cec6e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 28 Nov 2023 20:30:27 +0800 Subject: block: warn once for each partition in bio_check_ro() Commit 1b0a151c10a6 ("blk-core: use pr_warn_ratelimited() in bio_check_ro()") fixed the message storm by limiting the rate; however, there will still be lots of messages in the long term. Fix it better by warning once for each partition. 
Signed-off-by: Yu Kuai Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231128123027.971610-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f7d40692dd948..b29ebd53417d7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -70,6 +70,7 @@ struct block_device { #ifdef CONFIG_FAIL_MAKE_REQUEST bool bd_make_it_fail; #endif + bool bd_ro_warned; /* * keep this out-of-line as it's both big and not needed in the fast * path -- cgit v1.2.3 From a3db64c575ca201c9783f100c70b82d52bd78a93 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 27 Nov 2023 13:37:09 +0100 Subject: tty: make tty const in tty_get_baud_rate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit 87888fb9ac0c ("tty: Remove baudrate dead code & make ktermios params const"), the 'tty' parameter is only read in tty_get_baud_rate(). Therefore, we can make 'tty' accepted in the function 'const' for clarity. The "the terminal bit flags may be updated." part of the tty_get_baud_rate()'s kernel-doc is dropped as it is no longer true. Because of the same commit above. And it was misplaced anyway. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Ilpo Järvinen Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231127123713.14504-1-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 7625fc98fef3f..e96c85f4f91ed 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -440,10 +440,9 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, * * Returns: the baud rate as an integer for this terminal * - * Locking: The termios lock must be held by the caller and the terminal bit - * flags may be updated. + * Locking: The termios lock must be held by the caller. */ -static inline speed_t tty_get_baud_rate(struct tty_struct *tty) +static inline speed_t tty_get_baud_rate(const struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } -- cgit v1.2.3 From eec4954b81c3d9a38b99e78afb553c359db40093 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Nov 2023 10:28:15 +0000 Subject: driver core: make device_is_dependent() static The function device_is_dependent() is only called by the driver core internally and should not, at this time, be called by anyone else outside of it, so mark it as static so as not to give driver authors the wrong idea. Cc: Saravana Kannan Acked-by: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/2023112815-faculty-thud-add8@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index d7a72a8749ea0..4aa34c8d13610 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1071,7 +1071,6 @@ int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); -int device_is_dependent(struct device *dev, void *target); static inline bool device_supports_offline(struct device *dev) { -- cgit v1.2.3 From 5431fdd2c181dd2eac218e45b44deb2925fa48f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 17 Sep 2023 13:24:21 +0200 Subject: ptrace: Convert ptrace_attach() to use lock guards Created as testing for the conditional guard infrastructure. Specifically this makes use of the following form: scoped_cond_guard (mutex_intr, return -ERESTARTNOINTR, &task->signal->cred_guard_mutex) { ... } ... 
return 0; Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lkml.kernel.org/r/20231102110706.568467727%40infradead.org --- include/linux/sched/task.h | 2 ++ include/linux/spinlock.h | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a23af225c8983..4f3dca3535568 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -226,4 +226,6 @@ static inline void task_unlock(struct task_struct *p) spin_unlock(&p->alloc_lock); } +DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T)) + #endif /* _LINUX_SCHED_TASK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ceb56b39c70f7..90bc853cafb6a 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -548,5 +548,31 @@ DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t, DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try, spin_trylock_irqsave(_T->lock, _T->flags)) +DEFINE_LOCK_GUARD_1(read_lock, rwlock_t, + read_lock(_T->lock), + read_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t, + read_lock_irq(_T->lock), + read_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t, + read_lock_irqsave(_T->lock, _T->flags), + read_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + +DEFINE_LOCK_GUARD_1(write_lock, rwlock_t, + write_lock(_T->lock), + write_unlock(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t, + write_lock_irq(_T->lock), + write_unlock_irq(_T->lock)) + +DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t, + write_lock_irqsave(_T->lock, _T->flags), + write_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags) + #undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ -- cgit v1.2.3 From febab20caebac959fdc3d7520bc52de8b1184455 Mon Sep 17 00:00:00 2001 From: Wyes Karny Date: Fri, 17 Nov 2023 06:38:39 +0000 Subject: cpufreq/amd-pstate: 
Fix scaling_min_freq and scaling_max_freq update When amd_pstate is running, writing to scaling_min_freq and scaling_max_freq has no effect. These values are only passed to the policy level, but not to the platform level. This means that the platform does not know about the frequency limits set by the user. To fix this, update the min_perf and max_perf values at the platform level whenever the user changes the scaling_min_freq and scaling_max_freq values. Fixes: ffa5096a7c33 ("cpufreq: amd-pstate: implement Pstate EPP support for the AMD processors") Acked-by: Huang Rui Signed-off-by: Wyes Karny Signed-off-by: Rafael J. Wysocki --- include/linux/amd-pstate.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h index 446394f846064..6ad02ad9c7b42 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -70,6 +70,10 @@ struct amd_cpudata { u32 nominal_perf; u32 lowest_nonlinear_perf; u32 lowest_perf; + u32 min_limit_perf; + u32 max_limit_perf; + u32 min_limit_freq; + u32 max_limit_freq; u32 max_freq; u32 min_freq; -- cgit v1.2.3 From 48eb03dd26304c24f03bdbb9382e89c8564e71df Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:08 -0800 Subject: xsk: Add TX timestamp and TX checksum offload support This change actually defines the (initial) metadata layout that should be used by AF_XDP userspace (xsk_tx_metadata). The first field is flags which requests appropriate offloads, followed by the offload-specific fields. The supported per-device offloads are exported via netlink (new xsk-flags). The offloads themselves are still implemented in a bit of a framework-y fashion that's left from my initial kfunc attempt. I'm introducing new xsk_tx_metadata_ops which drivers are supposed to implement. The drivers are also supposed to call xsk_tx_metadata_request/xsk_tx_metadata_complete in the right places. 
Since xsk_tx_metadata_{request,_complete} are static inline, we don't incur any extra overhead doing indirect calls. The benefit of this scheme is as follows: - keeps all metadata layout parsing away from driver code - makes it easy to grep and see which drivers implement what - don't need any extra flags to maintain to keep track of what offloads are implemented; if the callback is implemented - the offload is supported (used by netlink reporting code) Two offloads are defined right now: 1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset 2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata area upon completion (tx_timestamp field) XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass metadata pointer). The struct is forward-compatible and can be extended in the future by appending more fields. Reviewed-by: Song Yoong Siang Signed-off-by: Stanislav Fomichev Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 ++ include/linux/skbuff.h | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87caa81f70ca..08da8b28c8164 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1865,6 +1865,7 @@ enum netdev_stat_type { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. + * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. 
* @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour @@ -2128,6 +2129,7 @@ struct net_device { unsigned long long priv_flags; const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; + const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; int ifindex; unsigned short gflags; unsigned short hard_header_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e1..b370eb8d70f7f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -566,6 +566,15 @@ struct ubuf_info_msgzc { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); +/* Preserve some data across TX submission and completion. + * + * Note, this state is stored in the driver. Extending the layout + * might need some special care. + */ +struct xsk_tx_metadata_compl { + __u64 *tx_timestamp; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -578,7 +587,10 @@ struct skb_shared_info { /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; - struct skb_shared_hwtstamps hwtstamps; + union { + struct skb_shared_hwtstamps hwtstamps; + struct xsk_tx_metadata_compl xsk_meta; + }; unsigned int gso_type; u32 tskey; -- cgit v1.2.3 From 8866730aed5100f06d3d965c22f1c61f74942541 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 28 Nov 2023 17:25:56 -0800 Subject: bpf, sockmap: af_unix stream sockets need to hold ref for pair sock AF_UNIX stream sockets are a paired socket. So sending on one of the pairs will lookup the paired socket as part of the send operation. It is possible however to put just one of the pairs in a BPF map. 
This currently increments the refcnt on the sock in the sockmap to ensure it is not free'd by the stack before sockmap cleans up its state and stops any skbs being sent/recv'd to that socket. But we missed a case. If the peer socket is closed it will be free'd by the stack. However, the paired socket can still be referenced from BPF sockmap side because we hold a reference there. Then if we are sending traffic through BPF sockmap to that socket it will try to dereference the free'd pair in its send logic creating a use after free. And following splat: [59.900375] BUG: KASAN: slab-use-after-free in sk_wake_async+0x31/0x1b0 [59.901211] Read of size 8 at addr ffff88811acbf060 by task kworker/1:2/954 [...] [59.905468] Call Trace: [59.905787] [59.906066] dump_stack_lvl+0x130/0x1d0 [59.908877] print_report+0x16f/0x740 [59.910629] kasan_report+0x118/0x160 [59.912576] sk_wake_async+0x31/0x1b0 [59.913554] sock_def_readable+0x156/0x2a0 [59.914060] unix_stream_sendmsg+0x3f9/0x12a0 [59.916398] sock_sendmsg+0x20e/0x250 [59.916854] skb_send_sock+0x236/0xac0 [59.920527] sk_psock_backlog+0x287/0xaa0 To fix let BPF sockmap hold a refcnt on both the socket in the sockmap and its paired socket. It wasn't obvious how to contain the fix to bpf_unix logic. The primary problem with keeping this logic in bpf_unix was: In the sock close() we could handle the deref by having a close handler. But, when we are destroying the psock through a map delete operation we wouldn't have gotten any signal through the proto struct other than it being replaced. If we do the deref from the proto replace it's too early because we need to deref the sk_pair after the backlog worker has been stopped. Given all this it seems best to just cache it at the end of the psock and eat 8B for the af_unix and vsock users. Notice dgram sockets are OK because they handle locking already.
Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20231129012557.95371-2-john.fastabend@gmail.com --- include/linux/skmsg.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c1637515a8a41..c953b8c0d2f43 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -106,6 +106,7 @@ struct sk_psock { struct mutex work_mutex; struct sk_psock_work_state work_state; struct delayed_work work; + struct sock *sk_pair; struct rcu_work rwork; }; -- cgit v1.2.3 From 7577bc8249c3fc86096ef1b1c9a8f4b6232231e7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Nov 2023 18:29:20 -0800 Subject: tcp: Don't pass cookie to __cookie_v[46]_check(). tcp_hdr(skb) and SYN Cookie are passed to __cookie_v[46]_check(), but none of the callers passes cookie other than ntohl(th->ack_seq) - 1. Let's fetch it in __cookie_v[46]_check() instead of passing the cookie over and over. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231129022924.96156-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/netfilter_ipv6.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 7834c0be2831d..61aa48f46dd72 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -51,7 +51,7 @@ struct nf_ipv6_ops { u32 (*cookie_init_sequence)(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); int (*cookie_v6_check)(const struct ipv6hdr *iph, - const struct tcphdr *th, __u32 cookie); + const struct tcphdr *th); #endif void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -179,16 +179,16 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph, } static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, - const struct tcphdr *th, __u32 cookie) + const struct tcphdr *th) { #if IS_ENABLED(CONFIG_SYN_COOKIES) #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (v6_ops) - return v6_ops->cookie_v6_check(iph, th, cookie); + return v6_ops->cookie_v6_check(iph, th); #elif IS_BUILTIN(CONFIG_IPV6) - return __cookie_v6_check(iph, th, cookie); + return __cookie_v6_check(iph, th); #endif #endif return 0; -- cgit v1.2.3 From 39cefd85098d12439586824c39f8e1948fac186d Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 29 Nov 2023 17:31:55 +0100 Subject: spi: introduce SPI_TRANS_FAIL_IO for error reporting The default message transfer implementation - spi_transfer_one_message - invokes the specific device driver's transfer_one(), then waits for completion. However, there is no mechanism for the device driver to report failure in the middle of the transfer. Introduce SPI_TRANS_FAIL_IO for drivers to report transfer failure. 
Signed-off-by: Nam Cao Acked-by: Linus Walleij Link: https://lore.kernel.org/r/4b420dac528e60f122adde16851da88e4798c1ea.1701274975.git.namcao@linutronix.de Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5a..aa25ae04c5c37 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -461,10 +461,13 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem - * can issue the next transfer. Note: transfer_one and - * transfer_one_message are mutually exclusive; when both - * are set, the generic subsystem does not call your - * transfer_one callback. + * can issue the next transfer. If the transfer fails, the + * driver must set the flag SPI_TRANS_FAIL_IO to + * spi_transfer->error first, before calling + * spi_finalize_current_transfer(). + * Note: transfer_one and transfer_one_message are mutually + * exclusive; when both are set, the generic subsystem does + * not call your transfer_one callback. * @handle_err: the subsystem calls the driver to handle an error that occurs * in the generic implementation of transfer_one_message(). * @mem_ops: optimized/dedicated operations for interactions with SPI memory. 
@@ -1040,6 +1043,7 @@ struct spi_transfer { unsigned len; #define SPI_TRANS_FAIL_NO_START BIT(0) +#define SPI_TRANS_FAIL_IO BIT(1) u16 error; dma_addr_t tx_dma; -- cgit v1.2.3 From cff49d58f57e5667c10a0db85d7461790bb85cf8 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Wed, 29 Nov 2023 14:43:10 +0800 Subject: spi: Unify error codes by replacing -ENOTSUPP with -EOPNOTSUPP This commit updates the SPI subsystem, particularly affecting "SPI MEM" drivers and core parts, by replacing the -ENOTSUPP error code with -EOPNOTSUPP. The key motivations for this change are as follows: 1. The spi-nor driver currently uses EOPNOTSUPP, whereas calls to spi-mem might return ENOTSUPP. This update aims to unify the error reporting within the SPI subsystem for clarity and consistency. 2. The use of ENOTSUPP has been flagged by checkpatch as inappropriate, mainly being reserved for NFS-related errors. To align with kernel coding standards and recommendations, this change is being made. 3. By using EOPNOTSUPP, we provide more specific context to the error, indicating that a particular operation is not supported. This helps differentiate from the more generic ENOTSUPP error, allowing drivers to better handle and respond to different error scenarios. Risks and Considerations: While this change is primarily intended as a code cleanup and error code unification, there is a minor risk of breaking user-space applications that rely on specific return codes for unsupported operations. However, this risk is considered low, as such use-cases are unlikely to be common or critical. Nevertheless, developers and users should be aware of this change, especially if they have scripts or tools that specifically handle SPI error codes. This commit does not introduce any functional changes to the SPI subsystem or the affected drivers. 
Signed-off-by: "Chia-Lin Kao (AceLan)" Acked-by: Tudor Ambarus Reviewed-by: Mika Westerberg Acked-by: Miquel Raynal Acked-by: Michael Walle Link: https://lore.kernel.org/r/20231129064311.272422-1-acelan.kao@canonical.com Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 6b0a7dc48a4b7..f866d5c8ed32a 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -233,6 +233,8 @@ static inline void *spi_mem_get_drvdata(struct spi_mem *mem) * limitations) * @supports_op: check if an operation is supported by the controller * @exec_op: execute a SPI memory operation + * not all driver provides supports_op(), so it can return -EOPNOTSUPP + * if the op is not supported by the driver/controller * @get_name: get a custom name for the SPI mem device from the controller. * This might be needed if the controller driver has been ported * to use the SPI mem layer and a custom name is used to keep -- cgit v1.2.3 From 4ea95c04fa6b9043a1a301240996aeebe3cb28ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 29 Nov 2023 16:10:00 -0800 Subject: vfio: Drop vfio_file_iommu_group() stub to fudge around a KVM wart Drop the vfio_file_iommu_group() stub and instead unconditionally declare the function to fudge around a KVM wart where KVM tries to do symbol_get() on vfio_file_iommu_group() (and other VFIO symbols) even if CONFIG_VFIO=n. Ensuring the symbol is always declared fixes a PPC build error when modules are also disabled, in which case symbol_get() simply points at the address of the symbol (with some attributes shenanigans). Because KVM does symbol_get() instead of directly depending on VFIO, the lack of a fully defined symbol is not problematic (ugly, but "fine"). 
arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: error: attribute declaration must precede definition [-Werror,-Wignored-attributes] fn = symbol_get(vfio_file_iommu_group); ^ include/linux/module.h:805:60: note: expanded from macro 'symbol_get' #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) ^ include/linux/vfio.h:294:35: note: previous definition is here static inline struct iommu_group *vfio_file_iommu_group(struct file *file) ^ arch/powerpc/kvm/../../../virt/kvm/vfio.c:89:7: error: attribute declaration must precede definition [-Werror,-Wignored-attributes] fn = symbol_get(vfio_file_iommu_group); ^ include/linux/module.h:805:65: note: expanded from macro 'symbol_get' #define symbol_get(x) ({ extern typeof(x) x __attribute__((weak,visibility("hidden"))); &(x); }) ^ include/linux/vfio.h:294:35: note: previous definition is here static inline struct iommu_group *vfio_file_iommu_group(struct file *file) ^ 2 errors generated. Although KVM is firmly in the wrong (there is zero reason for KVM to build virt/kvm/vfio.c when VFIO is disabled), fudge around the error in VFIO as the stub is unnecessary and doesn't serve its intended purpose (KVM is the only external user of vfio_file_iommu_group()), and there is an in-flight series to clean up the entire KVM<->VFIO interaction, i.e. fixing this in KVM would result in more churn in the long run, and the stub needs to go away regardless. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308251949.5IiaV0sz-lkp@intel.com Closes: https://lore.kernel.org/oe-kbuild-all/202309030741.82aLACDG-lkp@intel.com Closes: https://lore.kernel.org/oe-kbuild-all/202309110914.QLH0LU6L-lkp@intel.com Link: https://lore.kernel.org/all/0-v1-08396538817d+13c5-vfio_kvm_kconfig_jgg@nvidia.com Link: https://lore.kernel.org/all/20230916003118.2540661-1-seanjc@google.com Cc: Nick Desaulniers Cc: Jason Gunthorpe Tested-by: Michael Ellerman Fixes: c1cce6d079b8 ("vfio: Compile vfio_group infrastructure optionally") Signed-off-by: Sean Christopherson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231130001000.543240-1-seanjc@google.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 454e9295970c4..a65b2513f8cdc 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -289,16 +289,12 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, /* * External user API */ -#if IS_ENABLED(CONFIG_VFIO_GROUP) struct iommu_group *vfio_file_iommu_group(struct file *file); + +#if IS_ENABLED(CONFIG_VFIO_GROUP) bool vfio_file_is_group(struct file *file); bool vfio_file_has_dev(struct file *file, struct vfio_device *device); #else -static inline struct iommu_group *vfio_file_iommu_group(struct file *file) -{ - return NULL; -} - static inline bool vfio_file_is_group(struct file *file) { return false; -- cgit v1.2.3 From 3717194f249227a3dfd8433bd9374cc7e0cf823d Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 29 Nov 2023 13:06:15 +0200 Subject: Input: gpio-keys - add system suspend support for dedicated wakeirqs Some SoCs have a separate dedicated wake-up interrupt controller that can be used to wake up the system from deeper idle states. 
We already support configuring a separate interrupt for a gpio-keys button to be used with a gpio line. However, we are lacking system suspend support for cases where a separate interrupt needs to be used in deeper sleep modes. Because of its nature, gpio-keys does not know about the runtime PM state of the button gpios, and may have several gpio buttons configured for each gpio-keys device instance. Implementing runtime PM support for gpio-keys does not help, and we cannot use drivers/base/power/wakeirq.c support. We need to implement custom wakeirq support for gpio-keys. For handling a dedicated wakeirq for system suspend, we enable and disable it with gpio_keys_enable_wakeup() and gpio_keys_disable_wakeup() that we already use based on device_may_wakeup(). Some systems may have a dedicated wakeirq that can also be used as the main interrupt; this is already working for gpio-keys. Let's add some wakeirq related comments while at it as the usage with a gpio line and separate interrupt line may not be obvious.
Tested-by: Dhruva Gole Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20231129110618.27551-2-tony@atomide.com Signed-off-by: Dmitry Torokhov --- include/linux/gpio_keys.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio_keys.h b/include/linux/gpio_keys.h index 3f84aeb81e480..80fa930b04c67 100644 --- a/include/linux/gpio_keys.h +++ b/include/linux/gpio_keys.h @@ -21,6 +21,7 @@ struct device; * disable button via sysfs * @value: axis value for %EV_ABS * @irq: Irq number in case of interrupt keys + * @wakeirq: Optional dedicated wake-up interrupt */ struct gpio_keys_button { unsigned int code; @@ -34,6 +35,7 @@ struct gpio_keys_button { bool can_disable; int value; unsigned int irq; + unsigned int wakeirq; }; /** -- cgit v1.2.3 From da2e08d4630ab04ee5b61515fe423c582b5c3be2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 3 Oct 2023 16:18:38 -0700 Subject: i40e: Annotate struct i40e_qvlist_info with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct i40e_qvlist_info. Cc: Tony Nguyen Cc: Shiraz Saleem Cc: Jakub Kicinski Cc: Jesse Brandeburg Cc: Gurucharan G Cc: "Gustavo A. R. Silva" Link: https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci [1] Reviewed-by: "Gustavo A. R. 
Silva" Link: https://lore.kernel.org/r/20231003231838.work.510-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/net/intel/i40e_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h index ed42bd5f639f2..0aa4411528fc5 100644 --- a/include/linux/net/intel/i40e_client.h +++ b/include/linux/net/intel/i40e_client.h @@ -45,7 +45,7 @@ struct i40e_qv_info { struct i40e_qvlist_info { u32 num_vectors; - struct i40e_qv_info qv_info[]; + struct i40e_qv_info qv_info[] __counted_by(num_vectors); }; -- cgit v1.2.3 From 6a3afb6ac6dfab158ebdd4b87941178f58c8939f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Nov 2023 19:47:40 +0800 Subject: jbd2: increase the journal IO's priority Current jbd2 only add REQ_SYNC for descriptor block, metadata log buffer, commit buffer and superblock buffer, the submitted IO could be throttled by writeback throttle in block layer, that could lead to priority inversion in some cases. The log IO looks like a kind of high priority metadata IO, so it should not be throttled by WBT like QOS policies in block layer, let's add REQ_SYNC | REQ_IDLE to exempt from writeback throttle, and also add REQ_META together indicates it's a metadata IO. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231129114740.2686201-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 6dcbb4eb80fb2..beb30719ee161 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1374,6 +1374,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) +/* Journal high priority write IO operation flags */ +#define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) + /* * Journal flag definitions */ -- cgit v1.2.3 From d839a656d0f3caca9f96e9bf912fd394ac6a11bc Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Fri, 1 Dec 2023 14:53:55 +0900 Subject: kprobes: consistent rcu api usage for kretprobe holder It seems that the pointer-to-kretprobe "rp" within the kretprobe_holder is RCU-managed, based on the (non-rethook) implementation of get_kretprobe(). The thought behind this patch is to make use of the RCU API where possible when accessing this pointer so that the needed barriers are always in place and to self-document the code. The __rcu annotation to "rp" allows for sparse RCU checking. Plain writes done to the "rp" pointer are changed to make use of the RCU macro for assignment. For the single read, the implementation of get_kretprobe() is simplified by making use of an RCU macro which accomplishes the same, but note that the log warning text will be more generic. I did find that there is a difference in assembly generated between the usage of the RCU macros vs without. For example, on arm64, when using rcu_assign_pointer(), the corresponding store instruction is a store-release (STLR) which has an implicit barrier. When normal assignment is done, a regular store (STR) is found. 
In the macro case, this seems to be a result of rcu_assign_pointer() using smp_store_release() when the value to write is not NULL. Link: https://lore.kernel.org/all/20231122132058.3359-1-inwardvessel@gmail.com/ Fixes: d741bf41d7c7 ("kprobes: Remove kretprobe hash") Cc: stable@vger.kernel.org Signed-off-by: JP Kobryn Acked-by: Masami Hiramatsu (Google) Signed-off-by: Masami Hiramatsu (Google) --- include/linux/kprobes.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ab1da3142b06a..64672bace5609 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -139,7 +139,7 @@ static inline bool kprobe_ftrace(struct kprobe *p) * */ struct kretprobe_holder { - struct kretprobe *rp; + struct kretprobe __rcu *rp; struct objpool_head pool; }; @@ -245,10 +245,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) { - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), - "Kretprobe is accessed from instance under preemptive context"); - - return READ_ONCE(ri->rph->rp); + return rcu_dereference_check(ri->rph->rp, rcu_read_lock_any_held()); } static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) -- cgit v1.2.3 From a1461f1fd6cfdc4b8917c9d4a91e92605d1f28dc Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Fri, 1 Dec 2023 14:53:56 +0900 Subject: rethook: Use __rcu pointer for rethook::handler Since the rethook::handler is an RCU-managed pointer so that it will notify readers whether the rethook is stopped (unregistered) or not, it should be an __rcu pointer and use appropriate functions to be accessed. This will use appropriate memory barrier when accessing it. OTOH, rethook::data is never changed, so we don't need to check it in get_kretprobe().
NOTE: To avoid sparse warning, rethook::handler is defined by a raw function pointer type with __rcu instead of rethook_handler_t. Link: https://lore.kernel.org/all/170126066201.398836.837498688669005979.stgit@devnote2/ Fixes: 54ecbe6f1ed5 ("rethook: Add a generic return hook") Cc: stable@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311241808.rv9ceuAh-lkp@intel.com/ Tested-by: JP Kobryn Signed-off-by: Masami Hiramatsu (Google) --- include/linux/kprobes.h | 6 ++---- include/linux/rethook.h | 7 ++++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 64672bace5609..0ff44d6633e33 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -197,10 +197,8 @@ extern int arch_trampoline_kprobe(struct kprobe *p); #ifdef CONFIG_KRETPROBE_ON_RETHOOK static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) { - RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), - "Kretprobe is accessed from instance under preemptive context"); - - return (struct kretprobe *)READ_ONCE(ri->node.rethook->data); + /* rethook::data is non-changed field, so that you can access it freely. */ + return (struct kretprobe *)ri->node.rethook->data; } static nokprobe_inline unsigned long get_kretprobe_retaddr(struct kretprobe_instance *ri) { diff --git a/include/linux/rethook.h b/include/linux/rethook.h index ce69b2b7bc358..ba60962805f6d 100644 --- a/include/linux/rethook.h +++ b/include/linux/rethook.h @@ -28,7 +28,12 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, unsigned long, */ struct rethook { void *data; - rethook_handler_t handler; + /* + * To avoid sparse warnings, this uses a raw function pointer with + * __rcu, instead of rethook_handler_t. But this must be same as + * rethook_handler_t. 
+ */ + void (__rcu *handler) (struct rethook_node *, void *, unsigned long, struct pt_regs *); struct objpool_head pool; struct rcu_head rcu; }; -- cgit v1.2.3 From df16c1c51d8166958f533c0c886766f7ee9dd50f Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Mon, 27 Nov 2023 15:41:10 -0600 Subject: net: phy: mdio_device: Reset device only when necessary Currently the phy reset sequence is as shown below for a devicetree described mdio phy on boot: 1. Assert the phy_device's reset as part of registering 2. Deassert the phy_device's reset as part of registering 3. Deassert the phy_device's reset as part of phy_probe 4. Deassert the phy_device's reset as part of phy_hw_init The extra two deasserts include waiting the deassert delay afterwards, which is adding unnecessary delay. This applies to both possible types of resets (reset controller reference and a reset gpio) that can be used. Here's some snipped tracing output using the following command line params "trace_event=gpio:* trace_options=stacktrace" illustrating the reset handling and where its coming from: /* Assert */ systemd-udevd-283 [002] ..... 6.780434: gpio_value: 544 set 0 systemd-udevd-283 [002] ..... 6.783849: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => mdiobus_register_device => phy_device_register => fwnode_mdiobus_phy_device_register => fwnode_mdiobus_register_phy => __of_mdiobus_register => stmmac_mdio_register => stmmac_dvr_probe => stmmac_pltfr_probe => devm_stmmac_pltfr_probe => qcom_ethqos_probe => platform_probe /* Deassert */ systemd-udevd-283 [002] ..... 6.802480: gpio_value: 544 set 1 systemd-udevd-283 [002] ..... 
6.805886: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => phy_device_register => fwnode_mdiobus_phy_device_register => fwnode_mdiobus_register_phy => __of_mdiobus_register => stmmac_mdio_register => stmmac_dvr_probe => stmmac_pltfr_probe => devm_stmmac_pltfr_probe => qcom_ethqos_probe => platform_probe /* Deassert */ systemd-udevd-283 [002] ..... 6.882601: gpio_value: 544 set 1 systemd-udevd-283 [002] ..... 6.886014: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => phy_probe => really_probe => __driver_probe_device => driver_probe_device => __device_attach_driver => bus_for_each_drv => __device_attach => device_initial_probe => bus_probe_device => device_add => phy_device_register => fwnode_mdiobus_phy_device_register => fwnode_mdiobus_register_phy => __of_mdiobus_register => stmmac_mdio_register => stmmac_dvr_probe => stmmac_pltfr_probe => devm_stmmac_pltfr_probe => qcom_ethqos_probe => platform_probe /* Deassert */ NetworkManager-477 [000] ..... 7.023144: gpio_value: 544 set 1 NetworkManager-477 [000] ..... 7.026596: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => phy_init_hw => phy_attach_direct => phylink_fwnode_phy_connect => __stmmac_open => stmmac_open There's a lot of paths where the device is getting its reset asserted and deasserted. Let's track the state and only actually do the assert/deassert when it changes. 
Reported-by: Sagar Cheluvegowda Signed-off-by: Andrew Halaney Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20231127-net-phy-reset-once-v2-1-448e8658779e@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 007fd9c3e4b62..79ceee3c8673e 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -38,6 +38,7 @@ struct mdio_device { /* Bus address of the MDIO device (0-31) */ int addr; int flags; + int reset_state; struct gpio_desc *reset_gpio; struct reset_control *reset_ctrl; unsigned int reset_assert_delay; -- cgit v1.2.3 From 7232522e6cafdf466ed7649c14546fd07ccc1978 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 18:56:18 +0200 Subject: fanotify: store fsid in mark instead of in connector Some filesystems like fuse and nfs have zero or non-unique fsid. We would like to avoid reporting ambiguous fsid in events, so we need to avoid marking objects with same fsid and different sb. To make this easier to enforce, store the fsid in the marks of the group instead of in the shared connector.
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20231130165619.3386452-2-amir73il@gmail.com> --- include/linux/fsnotify_backend.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index c0892d75ce333..a80b525ca6538 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -472,10 +472,8 @@ typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ -#define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 #define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02 unsigned short flags; /* flags [lock] */ - __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; @@ -530,6 +528,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100 #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 +#define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 unsigned int flags; /* flags [mark->lock] */ }; @@ -763,11 +762,10 @@ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int obj_type, - int add_flags, __kernel_fsid_t *fsid); + int add_flags); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, - unsigned int obj_type, int add_flags, - __kernel_fsid_t *fsid); + unsigned int obj_type, int add_flags); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, @@ -775,15 +773,14 @@ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, int add_flags) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL); + FSNOTIFY_OBJ_TYPE_INODE, 
add_flags); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int add_flags) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, - FSNOTIFY_OBJ_TYPE_INODE, add_flags, - NULL); + FSNOTIFY_OBJ_TYPE_INODE, add_flags); } /* given a group and a mark, flag mark to be freed when all references are dropped */ -- cgit v1.2.3 From 30ad1938326bf9303ca38090339d948975a626f5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 18:56:19 +0200 Subject: fanotify: allow "weak" fsid when watching a single filesystem So far, fanotify returns -ENODEV or -EXDEV when trying to set a mark on a filesystem with a "weak" fsid, namely, zero fsid (e.g. fuse), or non-uniform fsid (e.g. btrfs non-root subvol). When group is watching inodes all from the same filesystem (or subvol), allow adding inode marks with "weak" fsid, because there is no ambiguity regarding which filesystem reports the event. The first mark added to a group determines if this group is single or multi filesystem, depending on the fsid at the path of the added mark. If the first mark added has a "strong" fsid, marks with "weak" fsid cannot be added and vice versa. If the first mark added has a "weak" fsid, following marks must have the same "weak" fsid and the same sb as the first mark. 
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Message-Id: <20231130165619.3386452-3-amir73il@gmail.com> --- include/linux/fsnotify_backend.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index a80b525ca6538..7f63be5ca0f1d 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -529,6 +529,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200 #define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400 #define FSNOTIFY_MARK_FLAG_HAS_FSID 0x0800 +#define FSNOTIFY_MARK_FLAG_WEAK_FSID 0x1000 unsigned int flags; /* flags [mark->lock] */ }; -- cgit v1.2.3 From c66272a4c9932d6c585eef99039747617d48d662 Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 25 Nov 2023 19:50:10 +0800 Subject: crypto: hisilicon/qm - simplify the status of qm The 'QM_INIT' and 'QM_CLOSE' status of qm and 'QP_INIT' and 'QP_CLOSE' status of queue are not actually used. Currently, the driver only needs to switch status when the device or queue is enabled or stopped. Therefore, remove the unneeded statuses to simplify the driver. In addition, rename 'QM_START' to 'QM_WORK' to make it easier to understand. 
Signed-off-by: Weili Qian Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index ddc7ebb705234..e3c0a1297b2c0 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -108,17 +108,13 @@ enum qm_stop_reason { }; enum qm_state { - QM_INIT = 0, - QM_START, - QM_CLOSE, + QM_WORK = 0, QM_STOP, }; enum qp_state { - QP_INIT = 1, - QP_START, + QP_START = 1, QP_STOP, - QP_CLOSE, }; enum qm_hw_ver { -- cgit v1.2.3 From 488e8f685207e0758398963d6834f81e5e61c162 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 30 Nov 2023 16:16:22 +0200 Subject: fs: fork splice_file_range() from do_splice_direct() In preparation of calling do_splice_direct() without file_start_write() held, create a new helper splice_file_range(), to be called from context of ->copy_file_range() methods instead of do_splice_direct(). Currently, the only difference is that splice_file_range() does not take flags argument and that it asserts that file_start_write() is held, but we factor out a common helper do_splice_direct_actor() that will be used later. Use the new helper from __ceph_copy_file_range(), that was incorrectly passing to do_splice_direct() the copy flags argument as splice flags. The value of copy flags in ceph is always 0, so it is a semantic bug fix. Move the declaration of both helpers to linux/splice.h. 
Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231130141624.3338942-2-amir73il@gmail.com Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 -- include/linux/splice.h | 13 ++++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae0e2fb7bcea8..04422a0eccddd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3052,8 +3052,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); extern void diff --git a/include/linux/splice.h b/include/linux/splice.h index 6c461573434dd..49532d5dda523 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -80,11 +80,14 @@ extern ssize_t add_to_pipe(struct pipe_inode_info *, long vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, - splice_direct_actor *); -extern long do_splice(struct file *in, loff_t *off_in, - struct file *out, loff_t *off_out, - size_t len, unsigned int flags); +ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, + splice_direct_actor *actor); +long do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags); +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); +long splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len); extern long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags); -- cgit v1.2.3 From 
77070eeb882124614a40616f01bfe60947be5778 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Nov 2023 15:43:27 -0500 Subject: cgroup: Avoid false cacheline sharing of read mostly rstat_cpu The rstat_cpu and also rstat_css_list of the cgroup structure are read mostly variables. However, they may share the same cacheline as the subsequent rstat_flush_next and *bstat variables which can be updated frequently. That will slow down the cgroup_rstat_cpu() call which is called pretty frequently in the rstat code. Add a CACHELINE_PADDING() line in between them to avoid false cacheline sharing. A parallel kernel build on a 2-socket x86-64 server is used as the benchmarking tool for measuring the lock hold time. Below were the lock hold time frequency distribution before and after the patch: Run time Before patch After patch -------- ------------ ----------- 0-01 us 9,928,562 9,820,428 01-05 us 110,151 50,935 05-10 us 270 93 10-15 us 273 146 15-20 us 135 76 20-25 us 0 2 25-30 us 1 0 It can be seen that the patch further pushes the lock hold time towards the lower end. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 37518436cfe7f..5a97ea95b5649 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -496,6 +496,13 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; struct list_head rstat_css_list; + /* + * Add padding to separate the read mostly rstat_cpu and + * rstat_css_list into a different cacheline from the following + * rstat_flush_next and *bstat fields which can have frequent updates. + */ + CACHELINE_PADDING(_pad_); + /* * A singly-linked list of cgroup structures to be rstat flushed. 
* This is a scratch field to be used exclusively by -- cgit v1.2.3 From aabf7c37dfbce3e5fe24f0c86a34bc8f2f63cee8 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 29 Nov 2023 13:44:04 -0800 Subject: lkdtm: Add kfence read after free crash type Add the ability to allocate memory from kfence and trigger a read after free on that memory to validate that kfence is working properly. This is used by ChromeOS integration tests to validate that kfence errors can be collected on user devices and parsed properly. Cc: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Cc: Andrew Morton Cc: kasan-dev@googlegroups.com Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20231129214413.3156334-1-swboyd@chromium.org Signed-off-by: Kees Cook --- include/linux/kfence.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 401af47575141..88100cc9cabab 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -223,6 +223,8 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla #else /* CONFIG_KFENCE */ +#define kfence_sample_interval (0) + static inline bool is_kfence_address(const void *addr) { return false; } static inline void kfence_alloc_pool_and_metadata(void) { } static inline void kfence_init(void) { } -- cgit v1.2.3 From 12cd3cd8c797e07afcc47bc4afa760e4ec75e9d7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Nov 2023 17:11:42 +0200 Subject: params: Introduce the parse_unknown_fn type Introduce a new type for the callback to parse an unknown argument. This unifies function prototypes which take that as a parameter. 
Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231120151419.1661807-2-andriy.shevchenko@linux.intel.com Signed-off-by: Kees Cook --- include/linux/moduleparam.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 4fa9726bc3282..bfb85fd13e1fa 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -385,6 +385,8 @@ extern bool parameq(const char *name1, const char *name2); */ extern bool parameqn(const char *name1, const char *name2, size_t n); +typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); + /* Called on module insert or kernel boot */ extern char *parse_args(const char *name, char *args, @@ -392,9 +394,7 @@ extern char *parse_args(const char *name, unsigned num, s16 level_min, s16 level_max, - void *arg, - int (*unknown)(char *param, char *val, - const char *doing, void *arg)); + void *arg, parse_unknown_fn unknown); /* Called by module remove. */ #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 492c5d455969fc2e829f26ed4c83487b068f0dd7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 30 Nov 2023 13:53:06 -0800 Subject: block: bio-integrity: directly map user buffers Passthrough commands that utilize metadata currently need to bounce the user space buffer through the kernel. Add support for mapping user space directly so that we can avoid this costly overhead. This is similar to how the normal bio data payload utilizes user addresses with bio_map_user_iov(). If the user address can't directly be used for some reason, like too many segments or address misalignment, fall back to a copy of the user vec while keeping the user address pinned for the IO duration so that it can safely be copied on completion in any process context. 
Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20231130215309.2923568-2-kbusch@meta.com [axboe: fold in fix from Kanchan Joshi] Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 41d417ee13499..ec4db73e5f4ec 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,6 +324,8 @@ enum bip_flags { BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ + BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */ + BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */ }; /* @@ -718,6 +720,7 @@ static inline bool bioset_initialized(struct bio_set *bs) for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); @@ -789,6 +792,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } +static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, + ssize_t len, u32 seed) +{ + return -EINVAL; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ /* -- cgit v1.2.3 From e5da71f1e373f36c7506ffa9a60ef7ec6e84674d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 30 Nov 2023 13:53:08 -0800 Subject: iouring: remove IORING_URING_CMD_POLLED No more users of this flag. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20231130215309.2923568-4-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index aefb73eeeebff..fe23bf88f86fa 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -28,7 +28,6 @@ enum io_uring_cmd_flags { /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) -#define IORING_URING_CMD_POLLED (1U << 31) struct io_uring_cmd { struct file *file; -- cgit v1.2.3 From 8fadb86d4ced8b8349a3b227d6d66736ff150819 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 30 Nov 2023 13:53:09 -0800 Subject: io_uring: remove uring_cmd cookie No more users of this field. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20231130215309.2923568-5-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index fe23bf88f86fa..9e6ce6d4ab51f 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -32,12 +32,8 @@ enum io_uring_cmd_flags { struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; - union { - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); - /* used for polled completion */ - void *cookie; - }; + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ -- cgit v1.2.3 From 45b5623f2d721c25d1a2fdc8c4600fb4b7b61c75 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:55 -0800 Subject: bpf: rearrange 
bpf_func_state fields to save a bit of memory It's a trivial rearrangement saving 8 bytes. We have 4 bytes of padding at the end which can be filled with another field without increasing struct bpf_func_state. copy_func_state() logic remains correct without any further changes. BEFORE ====== struct bpf_func_state { struct bpf_reg_state regs[11]; /* 0 1320 */ /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */ int callsite; /* 1320 4 */ u32 frameno; /* 1324 4 */ u32 subprogno; /* 1328 4 */ u32 async_entry_cnt; /* 1332 4 */ bool in_callback_fn; /* 1336 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 21 boundary (1344 bytes) --- */ struct tnum callback_ret_range; /* 1344 16 */ bool in_async_callback_fn; /* 1360 1 */ bool in_exception_callback_fn; /* 1361 1 */ /* XXX 2 bytes hole, try to pack */ int acquired_refs; /* 1364 4 */ struct bpf_reference_state * refs; /* 1368 8 */ int allocated_stack; /* 1376 4 */ /* XXX 4 bytes hole, try to pack */ struct bpf_stack_state * stack; /* 1384 8 */ /* size: 1392, cachelines: 22, members: 13 */ /* sum members: 1379, holes: 3, sum holes: 13 */ /* last cacheline: 48 bytes */ }; AFTER ===== struct bpf_func_state { struct bpf_reg_state regs[11]; /* 0 1320 */ /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */ int callsite; /* 1320 4 */ u32 frameno; /* 1324 4 */ u32 subprogno; /* 1328 4 */ u32 async_entry_cnt; /* 1332 4 */ struct tnum callback_ret_range; /* 1336 16 */ /* --- cacheline 21 boundary (1344 bytes) was 8 bytes ago --- */ bool in_callback_fn; /* 1352 1 */ bool in_async_callback_fn; /* 1353 1 */ bool in_exception_callback_fn; /* 1354 1 */ /* XXX 1 byte hole, try to pack */ int acquired_refs; /* 1356 4 */ struct bpf_reference_state * refs; /* 1360 8 */ struct bpf_stack_state * stack; /* 1368 8 */ int allocated_stack; /* 1376 4 */ /* size: 1384, cachelines: 22, members: 13 */ /* sum members: 1379, holes: 1, sum holes: 1 */ /* padding: 4 */ /* last cacheline: 40 bytes */ }; Acked-by: Eduard 
Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d99a636d36a7c..0c0e1bccad45d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -297,8 +297,8 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - bool in_callback_fn; struct tnum callback_ret_range; + bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible @@ -316,8 +316,8 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; - int allocated_stack; struct bpf_stack_state *stack; + int allocated_stack; }; struct bpf_idx_pair { -- cgit v1.2.3 From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:58 -0800 Subject: bpf: enforce exact retval range on subprog/callback exit Instead of relying on potentially imprecise tnum representation of expected return value range for callbacks and subprogs, validate that smin/smax range satisfy exact expected range of return values. E.g., if callback would need to return [0, 2] range, tnum can't represent this precisely and instead will allow [0, 3] range. By checking smin/smax range, we can make sure that subprog/callback indeed returns only valid [0, 2] range. 
Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c0e1bccad45d..3378cc753061e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -275,6 +275,11 @@ struct bpf_reference_state { int callback_ref; }; +struct bpf_retval_range { + s32 minval; + s32 maxval; +}; + /* state of the program: * type of all registers and stack info */ @@ -297,7 +302,7 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - struct tnum callback_ret_range; + struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; -- cgit v1.2.3 From aeb9ce058d7c6193dc41e06b3a5b29d22c446b14 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 29 Nov 2023 07:27:53 +0000 Subject: cache: enforce cache groups Set up build time warnings to safeguard against future header changes of organized structs. Warning includes: 1) whether all variables are still in the same cache group 2) whether all the cache groups have the sum of the members size (in the maximum condition, including all members defined in configs) The __cache_group* variables are ignored in kernel-doc check in the various header files they appear in to enforce the cache groups. Suggested-by: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Signed-off-by: David S. 
Miller --- include/linux/cache.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cache.h b/include/linux/cache.h index 9900d20b76c28..0ecb17bb68837 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,6 +85,31 @@ #define cache_line_size() L1_CACHE_BYTES #endif +#ifndef __cacheline_group_begin +#define __cacheline_group_begin(GROUP) \ + __u8 __cacheline_group_begin__##GROUP[0] +#endif + +#ifndef __cacheline_group_end +#define __cacheline_group_end(GROUP) \ + __u8 __cacheline_group_end__##GROUP[0] +#endif + +#ifndef CACHELINE_ASSERT_GROUP_MEMBER +#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \ + BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \ + offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \ + offsetofend(TYPE, MEMBER) <= \ + offsetof(TYPE, __cacheline_group_end__##GROUP))) +#endif + +#ifndef CACHELINE_ASSERT_GROUP_SIZE +#define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \ + BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \ + offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \ + SIZE) +#endif + /* * Helper to add padding within a struct to ensure data fall into separate * cachelines. -- cgit v1.2.3 From a9c8c738066b7ba9e208cfc3200a6f60593982b4 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 6 Nov 2023 09:31:41 +0200 Subject: device property: Add fwnode_name_eq() Add fwnode_name_eq() to implement the functionality of of_node_name_eq() on fwnode property API. The same convention of ending the comparison at '@' (besides NUL) is applied on also both ACPI and swnode. The function is intended for comparing unit address-less node names on DT and firmware or swnodes compliant with DT bindings. Reviewed-by: Laurent Pinchart Tested-by: Laurent Pinchart Acked-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/linux/property.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a86..4ebbb169df206 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -109,6 +109,7 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, const char *fwnode_get_name(const struct fwnode_handle *fwnode); const char *fwnode_get_name_prefix(const struct fwnode_handle *fwnode); +bool fwnode_name_eq(const struct fwnode_handle *fwnode, const char *name); struct fwnode_handle *fwnode_get_parent(const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); -- cgit v1.2.3 From 578dc962ff2000ba4bf52d50717aea0819615634 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 24 Nov 2023 17:24:35 -0800 Subject: mtd: rawnand: Add destructive operation Erase and program operations need the write protect (wp) pin to be de-asserted to take effect. Add the concept of destructive operation and pass the information to exec_op() so controllers know when they should de-assert this pin without having to decode the command opcode. 
Signed-off-by: Boris Brezillon Signed-off-by: David Regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-1-dregan@broadcom.com --- include/linux/mtd/rawnand.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index c29ace15a053a..bd02aba5e6e3e 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1003,6 +1003,8 @@ struct nand_op_parser { /** * struct nand_operation - NAND operation descriptor * @cs: the CS line to select for this NAND operation + * @deassert_wp: set to true when the operation requires the WP pin to be + * de-asserted (ERASE, PROG, ...) * @instrs: array of instructions to execute * @ninstrs: length of the @instrs array * @@ -1010,6 +1012,7 @@ struct nand_op_parser { */ struct nand_operation { unsigned int cs; + bool deassert_wp; const struct nand_op_instr *instrs; unsigned int ninstrs; }; @@ -1021,6 +1024,14 @@ struct nand_operation { .ninstrs = ARRAY_SIZE(_instrs), \ } +#define NAND_DESTRUCTIVE_OPERATION(_cs, _instrs) \ + { \ + .cs = _cs, \ + .deassert_wp = true, \ + .instrs = _instrs, \ + .ninstrs = ARRAY_SIZE(_instrs), \ + } + int nand_op_parser_exec_op(struct nand_chip *chip, const struct nand_op_parser *parser, const struct nand_operation *op, bool check_only); -- cgit v1.2.3 From 68cce21e3cc5fea8d955a62394454149270c98bc Mon Sep 17 00:00:00 2001 From: David Regan Date: Fri, 24 Nov 2023 17:24:36 -0800 Subject: mtd: rawnand: NAND controller write protect Allow NAND controller to be responsible for write protect pin handling during fast path and exec_op destructive operation when controller_wp flag is set. 
Signed-off-by: David Regan Reviewed-by: Florian Fainelli Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20231125012438.15191-2-dregan@broadcom.com --- include/linux/mtd/rawnand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index bd02aba5e6e3e..a17f795070d84 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1115,6 +1115,7 @@ struct nand_controller_ops { * the bus without restarting an entire read operation nor * changing the column. * @supported_op.cont_read: The controller supports sequential cache reads. + * @controller_wp: the controller is in charge of handling the WP pin. */ struct nand_controller { struct mutex lock; @@ -1123,6 +1124,7 @@ struct nand_controller { unsigned int data_only_read: 1; unsigned int cont_read: 1; } supported_op; + bool controller_wp; }; static inline void nand_controller_init(struct nand_controller *nfc) -- cgit v1.2.3 From a87b8e3be926af0fc3b9b1af42b1127bd1ff077c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 1 Dec 2023 10:29:51 -0800 Subject: usb: core: Allow subclassed USB drivers to override usb_choose_configuration() For some USB devices we might want to do something different for usb_choose_configuration(). One example here is the r8152 driver where we want to end up using the vendor driver with the preferred interface. The r8152 driver tried to make things work by implementing a USB generic_subclass driver and then overriding the normal config selection after it happened. This is less than ideal and also caused breakage if someone deauthorized and re-authorized the USB device because the USB core ended up going back to its default logic for choosing the best config. I made an attempt to fix this [1] but it was a bit ugly. Let's do this better and allow USB generic_subclass drivers to override usb_choose_configuration(). 
[1] https://lore.kernel.org/r/20231130154337.1.Ie00e07f07f87149c9ce0b27ae4e26991d307e14b@changeid Suggested-by: Alan Stern Signed-off-by: Douglas Anderson Reviewed-by: Alan Stern Link: https://lore.kernel.org/r/20231201102946.v2.2.Iade5fa31997f1a0ca3e1dec0591633b02471df12@changeid Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 8c61643acd499..618e5a0b1a223 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1264,6 +1264,9 @@ struct usb_driver { * module is being unloaded. * @suspend: Called when the device is going to be suspended by the system. * @resume: Called when the device is being resumed by the system. + * @choose_configuration: If non-NULL, called instead of the default + * usb_choose_configuration(). If this returns an error then we'll go + * on to call the normal usb_choose_configuration(). * @dev_groups: Attributes attached to the device that will be created once it * is bound to the driver. * @drvwrap: Driver-model core structure wrapper. @@ -1287,6 +1290,9 @@ struct usb_device_driver { int (*suspend) (struct usb_device *udev, pm_message_t message); int (*resume) (struct usb_device *udev, pm_message_t message); + + int (*choose_configuration) (struct usb_device *udev); + const struct attribute_group **dev_groups; struct usbdrv_wrap drvwrap; const struct usb_device_id *id_table; -- cgit v1.2.3 From db9e54709895241dda23f9347f619afb15291353 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Tue, 21 Nov 2023 20:38:47 +0000 Subject: usb: typec: tcpm: add tcpm_port_error_recovery symbol Add tcpm_port_error_recovery symbol and corresponding event that runs in tcpm_pd_event handler to set the port to the ERROR_RECOVERY state. tcpci drivers can use the symbol to reset the port when tcpc faults affect port functionality. 
Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20231121203845.170234-5-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index ab7ca872950bb..65fac5e1f3178 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -173,5 +173,6 @@ void tcpm_pd_hard_reset(struct tcpm_port *port); void tcpm_tcpc_reset(struct tcpm_port *port); void tcpm_port_clean(struct tcpm_port *port); bool tcpm_port_is_toggling(struct tcpm_port *port); +void tcpm_port_error_recovery(struct tcpm_port *port); #endif /* __LINUX_USB_TCPM_H */ -- cgit v1.2.3 From 5e4c8814a431d21bfaf20b464134f40f2f81e152 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Tue, 21 Nov 2023 20:38:48 +0000 Subject: usb: typec: tcpci: add vconn over current fault handling to maxim_core Add TCPC_FAULT_STATUS_VCONN_OC constant and corresponding mask definition. Maxim TCPC is capable of detecting VConn over current faults, so add fault to alert mask. When a Vconn over current fault is triggered, put the port in an error recovery state via tcpm_port_error_recovery. 
Signed-off-by: RD Babiera Reviewed-by: Guenter Roeck Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20231121203845.170234-6-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 83376473ac765..467e8045e9f86 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -36,7 +36,9 @@ #define TCPC_ALERT_MASK 0x12 #define TCPC_POWER_STATUS_MASK 0x14 -#define TCPC_FAULT_STATUS_MASK 0x15 + +#define TCPC_FAULT_STATUS_MASK 0x15 +#define TCPC_FAULT_STATUS_MASK_VCONN_OC BIT(1) #define TCPC_EXTENDED_STATUS_MASK 0x16 #define TCPC_EXTENDED_STATUS_MASK_VSAFE0V BIT(0) @@ -104,6 +106,7 @@ #define TCPC_FAULT_STATUS 0x1f #define TCPC_FAULT_STATUS_ALL_REG_RST_TO_DEFAULT BIT(7) +#define TCPC_FAULT_STATUS_VCONN_OC BIT(1) #define TCPC_ALERT_EXTENDED 0x21 -- cgit v1.2.3 From 7cc4e6b0e4ddf610477fcec8e3d2a9caae7e8a6c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 29 Nov 2023 18:06:46 +0200 Subject: pinctrl: Convert unsigned to unsigned int Simple type conversion with no functional change implied. While at it, adjust indentation where it makes sense. 
Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231129161459.1002323-24-andriy.shevchenko@linux.intel.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/machine.h | 6 +++--- include/linux/pinctrl/pinconf-generic.h | 10 +++++----- include/linux/pinctrl/pinconf.h | 16 ++++++++-------- include/linux/pinctrl/pinctrl.h | 24 ++++++++++++------------ include/linux/pinctrl/pinmux.h | 22 +++++++++++----------- 5 files changed, 39 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/machine.h b/include/linux/pinctrl/machine.h index ee8803f6ad07c..673e96df453b3 100644 --- a/include/linux/pinctrl/machine.h +++ b/include/linux/pinctrl/machine.h @@ -47,7 +47,7 @@ struct pinctrl_map_mux { struct pinctrl_map_configs { const char *group_or_pin; unsigned long *configs; - unsigned num_configs; + unsigned int num_configs; }; /** @@ -154,13 +154,13 @@ struct pinctrl_map; #ifdef CONFIG_PINCTRL extern int pinctrl_register_mappings(const struct pinctrl_map *map, - unsigned num_maps); + unsigned int num_maps); extern void pinctrl_unregister_mappings(const struct pinctrl_map *map); extern void pinctrl_provide_dummies(void); #else static inline int pinctrl_register_mappings(const struct pinctrl_map *map, - unsigned num_maps) + unsigned int num_maps) { return 0; } diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index d74b7a4ea154d..a65d3d078e58b 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -193,17 +193,17 @@ struct pinconf_generic_params { int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct device_node *np, struct pinctrl_map **map, - unsigned *reserved_maps, unsigned *num_maps, + unsigned int *reserved_maps, unsigned int *num_maps, enum pinctrl_map_type type); int pinconf_generic_dt_node_to_map(struct pinctrl_dev *pctldev, struct device_node *np_config, struct pinctrl_map **map, - unsigned *num_maps, 
enum pinctrl_map_type type); + unsigned int *num_maps, enum pinctrl_map_type type); void pinconf_generic_dt_free_map(struct pinctrl_dev *pctldev, - struct pinctrl_map *map, unsigned num_maps); + struct pinctrl_map *map, unsigned int num_maps); static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctldev, struct device_node *np_config, struct pinctrl_map **map, - unsigned *num_maps) + unsigned int *num_maps) { return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, PIN_MAP_TYPE_CONFIGS_GROUP); @@ -211,7 +211,7 @@ static inline int pinconf_generic_dt_node_to_map_group(struct pinctrl_dev *pctld static inline int pinconf_generic_dt_node_to_map_pin(struct pinctrl_dev *pctldev, struct device_node *np_config, struct pinctrl_map **map, - unsigned *num_maps) + unsigned int *num_maps) { return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, PIN_MAP_TYPE_CONFIGS_PIN); diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h index f8a8215e9021e..770ec2221156c 100644 --- a/include/linux/pinctrl/pinconf.h +++ b/include/linux/pinctrl/pinconf.h @@ -40,25 +40,25 @@ struct pinconf_ops { bool is_generic; #endif int (*pin_config_get) (struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *config); int (*pin_config_set) (struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *configs, - unsigned num_configs); + unsigned int num_configs); int (*pin_config_group_get) (struct pinctrl_dev *pctldev, - unsigned selector, + unsigned int selector, unsigned long *config); int (*pin_config_group_set) (struct pinctrl_dev *pctldev, - unsigned selector, + unsigned int selector, unsigned long *configs, - unsigned num_configs); + unsigned int num_configs); void (*pin_config_dbg_show) (struct pinctrl_dev *pctldev, struct seq_file *s, - unsigned offset); + unsigned int offset); void (*pin_config_group_dbg_show) (struct pinctrl_dev *pctldev, struct seq_file *s, - unsigned 
selector); + unsigned int selector); void (*pin_config_config_dbg_show) (struct pinctrl_dev *pctldev, struct seq_file *s, unsigned long config); diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 4d252ea00ed1a..9a8189ffd0f2c 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -54,7 +54,7 @@ struct pingroup { * @drv_data: driver-defined per-pin data. pinctrl core does not touch this */ struct pinctrl_pin_desc { - unsigned number; + unsigned int number; const char *name; void *drv_data; }; @@ -82,7 +82,7 @@ struct pinctrl_gpio_range { unsigned int base; unsigned int pin_base; unsigned int npins; - unsigned const *pins; + unsigned int const *pins; struct gpio_chip *gc; }; @@ -108,18 +108,18 @@ struct pinctrl_gpio_range { struct pinctrl_ops { int (*get_groups_count) (struct pinctrl_dev *pctldev); const char *(*get_group_name) (struct pinctrl_dev *pctldev, - unsigned selector); + unsigned int selector); int (*get_group_pins) (struct pinctrl_dev *pctldev, - unsigned selector, - const unsigned **pins, - unsigned *num_pins); + unsigned int selector, + const unsigned int **pins, + unsigned int *num_pins); void (*pin_dbg_show) (struct pinctrl_dev *pctldev, struct seq_file *s, - unsigned offset); + unsigned int offset); int (*dt_node_to_map) (struct pinctrl_dev *pctldev, struct device_node *np_config, - struct pinctrl_map **map, unsigned *num_maps); + struct pinctrl_map **map, unsigned int *num_maps); void (*dt_free_map) (struct pinctrl_dev *pctldev, - struct pinctrl_map *map, unsigned num_maps); + struct pinctrl_map *map, unsigned int num_maps); }; /** @@ -193,7 +193,7 @@ extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range); extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *ranges, - unsigned nranges); + unsigned int nranges); extern void pinctrl_remove_gpio_range(struct pinctrl_dev *pctldev, struct pinctrl_gpio_range 
*range); @@ -203,8 +203,8 @@ extern struct pinctrl_gpio_range * pinctrl_find_gpio_range_from_pin(struct pinctrl_dev *pctldev, unsigned int pin); extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev, - const char *pin_group, const unsigned **pins, - unsigned *num_pins); + const char *pin_group, const unsigned int **pins, + unsigned int *num_pins); /** * struct pinfunction - Description about a function diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index a7e370965c531..d6f7b58d6ad0c 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -57,26 +57,26 @@ struct pinctrl_gpio_range; * the pin request. */ struct pinmux_ops { - int (*request) (struct pinctrl_dev *pctldev, unsigned offset); - int (*free) (struct pinctrl_dev *pctldev, unsigned offset); + int (*request) (struct pinctrl_dev *pctldev, unsigned int offset); + int (*free) (struct pinctrl_dev *pctldev, unsigned int offset); int (*get_functions_count) (struct pinctrl_dev *pctldev); const char *(*get_function_name) (struct pinctrl_dev *pctldev, - unsigned selector); + unsigned int selector); int (*get_function_groups) (struct pinctrl_dev *pctldev, - unsigned selector, - const char * const **groups, - unsigned *num_groups); - int (*set_mux) (struct pinctrl_dev *pctldev, unsigned func_selector, - unsigned group_selector); + unsigned int selector, + const char * const **groups, + unsigned int *num_groups); + int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector, + unsigned int group_selector); int (*gpio_request_enable) (struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range, - unsigned offset); + unsigned int offset); void (*gpio_disable_free) (struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range, - unsigned offset); + unsigned int offset); int (*gpio_set_direction) (struct pinctrl_dev *pctldev, struct pinctrl_gpio_range *range, - unsigned offset, + unsigned int offset, bool input); bool strict; }; -- cgit v1.2.3 
From 2202844e4468c7539dba0c0b06577c93735af952 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 6 Nov 2023 15:22:23 +0800 Subject: vfio/migration: Add debugfs to live migration driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are multiple devices, software and operational steps involved in the process of live migration. An error occurred on any node may cause the live migration operation to fail. This complex process makes it very difficult to locate and analyze the cause when the function fails. In order to quickly locate the cause of the problem when the live migration fails, I added a set of debugfs to the vfio live migration driver. +-------------------------------------------+ | | | | | QEMU | | | | | +---+----------------------------+----------+ | ^ | ^ | | | | | | | | v | v | +---------+--+ +---------+--+ |src vfio_dev| |dst vfio_dev| +--+---------+ +--+---------+ | ^ | ^ | | | | v | | | +-----------+----+ +-----------+----+ |src dev debugfs | |dst dev debugfs | +----------------+ +----------------+ The entire debugfs directory will be based on the definition of the CONFIG_DEBUG_FS macro. If this macro is not enabled, the interfaces in vfio.h will be empty definitions, and the creation and initialization of the debugfs directory will not be executed. vfio | +--- | +---migration | +--state | +--- +---migration +--state debugfs will create a public root directory "vfio" file. then create a dev_name() file for each live migration device. First, create a unified state acquisition file of "migration" in this device directory. Then, create a public live migration state lookup file "state". 
Signed-off-by: Longfang Liu Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20231106072225.28577-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index a65b2513f8cdc..89b265bc6ec31 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -69,6 +69,13 @@ struct vfio_device { u8 iommufd_attached:1; #endif u8 cdev_opened:1; +#ifdef CONFIG_DEBUG_FS + /* + * debug_root is a static property of the vfio_device + * which must be set prior to registering the vfio_device. + */ + struct dentry *debug_root; +#endif }; /** -- cgit v1.2.3 From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:20 +0800 Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr() map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. 
Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb447b0a94231..d273348cfb2fb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -106,7 +106,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exists. + */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, -- cgit v1.2.3 From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:22 +0800 Subject: bpf: Defer the free of inner map when necessary When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. 
Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with work field to reduce the size of bpf_map. Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d273348cfb2fb..de3bd03cbeea3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -276,7 +276,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -292,6 +296,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; -- cgit v1.2.3 From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:23 +0800 Subject: bpf: Optimize the free of inner map When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. 
However there is no need to wait for one RCU tasks trace grace period if the outer map is only accessed by non-sleepable program. So adding sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding the outer map into env->used_maps for sleepable program. Although the max number of bpf program is INT_MAX - 1, the number of bpf programs which are being loaded may be greater than INT_MAX, so using atomic64_t instead of atomic_t for sleepable_refcnt. When removing the inner map from the outer map, using sleepable_refcnt to decide whether or not a RCU tasks trace grace period is needed before freeing the inner map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de3bd03cbeea3..10e5e4d8a00fa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,8 @@ struct bpf_map { bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; + bool free_after_rcu_gp; + atomic64_t sleepable_refcnt; s64 __percpu *elem_count; }; -- cgit v1.2.3 From 2a502ff0c4e42a739b5aa550c901bf3852795532 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:34 -0800 Subject: net: Add queue and napi association Add the napi pointer in netdev queue for tracking the napi instance for each queue. This achieves the queue<->napi mapping. 
Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147331483.5260.15723438819994285695.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c2d74bc112ddc..5ddff11cbe260 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -665,6 +665,10 @@ struct netdev_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; /* * write-mostly part */ @@ -2657,6 +2661,10 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, + struct napi_struct *napi); + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ -- cgit v1.2.3 From 26793bfb5d6072326d1465343e7cbf6156abca4f Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:29:07 -0800 Subject: net: Add NAPI IRQ support Add support to associate the interrupt vector number for a NAPI instance. 
Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147334728.5260.13221803396905901904.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ddff11cbe260..5551177e024e0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -382,6 +382,7 @@ struct napi_struct { /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; + int irq; }; enum { @@ -2665,6 +2666,11 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); +static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +{ + napi->irq = irq; +} + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ -- cgit v1.2.3 From 37e4b8df27bc68340f3fc80dbb27e3549c7f881c Mon Sep 17 00:00:00 2001 From: Jianheng Zhang Date: Fri, 1 Dec 2023 03:22:03 +0000 Subject: net: stmmac: fix FPE events losing The status bits of register MAC_FPE_CTRL_STS are clear on read. Using 32-bit read for MAC_FPE_CTRL_STS in dwmac5_fpe_configure() and dwmac5_fpe_send_mpacket() clear the status bits. Then the stmmac interrupt handler missing FPE event status and leads to FPE handshaking failure and retries. To avoid clear status bits of MAC_FPE_CTRL_STS in dwmac5_fpe_configure() and dwmac5_fpe_send_mpacket(), add fpe_csr to stmmac_fpe_cfg structure to cache the control bits of MAC_FPE_CTRL_STS and to avoid reading MAC_FPE_CTRL_STS in those methods. 
Fixes: 5a5586112b92 ("net: stmmac: support FPE link partner hand-shaking procedure") Reviewed-by: Serge Semin Signed-off-by: Jianheng Zhang Link: https://lore.kernel.org/r/CY5PR12MB637225A7CF529D5BE0FBE59CBF81A@CY5PR12MB6372.namprd12.prod.outlook.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 0b4658a7eceb6..dee5ad6e48c5a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -175,6 +175,7 @@ struct stmmac_fpe_cfg { bool hs_enable; /* FPE handshake enable */ enum stmmac_fpe_state lp_fpe_state; /* Link Partner FPE state */ enum stmmac_fpe_state lo_fpe_state; /* Local station FPE state */ + u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ }; struct stmmac_safety_feature_cfg { -- cgit v1.2.3 From a5e400a985df8041ed4659ed1462aa9134318130 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 20 Aug 2023 20:58:56 +0300 Subject: net/mlx5e: Honor user choice of IPsec replay window size Users can configure IPsec replay window size, but mlx5 driver didn't honor their choice and set always 32bits. Fix assignment logic to configure right size from the beginning. 
Fixes: 7db21ef4566e ("net/mlx5e: Set IPsec replay sequence numbers") Reviewed-by: Patrisious Haddad Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f386..90ca63f4bf63d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -12001,6 +12001,13 @@ enum { MLX5_IPSEC_ASO_INC_SN = 0x2, }; +enum { + MLX5_IPSEC_ASO_REPLAY_WIN_32BIT = 0x0, + MLX5_IPSEC_ASO_REPLAY_WIN_64BIT = 0x1, + MLX5_IPSEC_ASO_REPLAY_WIN_128BIT = 0x2, + MLX5_IPSEC_ASO_REPLAY_WIN_256BIT = 0x3, +}; + struct mlx5_ifc_ipsec_aso_bits { u8 valid[0x1]; u8 reserved_at_201[0x1]; -- cgit v1.2.3 From c2bf84f1d1a1595dcc45fe867f0e02b331993fee Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 12 Nov 2023 13:50:00 +0200 Subject: net/mlx5e: Tidy up IPsec NAT-T SA discovery IPsec NAT-T packets are UDP encapsulated packets over ESP normal ones. In case they arrive to RX, the SPI and ESP are located in inner header, while the check was performed on outer header instead. That wrong check led to a situation where a received rekeying request was missed, which caused a rekey timeout, which in turn "compensated" for this failure by completing rekeying. 
Fixes: d65954934937 ("net/mlx5e: Support IPsec NAT-T functionality") Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 90ca63f4bf63d..3f7b664d625b9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -621,7 +621,7 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_140[0x8]; u8 bth_dst_qp[0x18]; - u8 reserved_at_160[0x20]; + u8 inner_esp_spi[0x20]; u8 outer_esp_spi[0x20]; u8 reserved_at_1a0[0x60]; }; -- cgit v1.2.3 From 70da1d01edf6da3fde1df98b2125a77083a0fb82 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 16:36:55 +0200 Subject: cpu/hotplug: remove CPUHP_SLAB_PREPARE hooks The CPUHP_SLAB_PREPARE hooks are only used by SLAB which is removed. SLUB defines them as NULL, so we can remove those altogether. Acked-by: Thomas Gleixner Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/cpuhotplug.h | 1 - include/linux/slab.h | 8 -------- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d305db70674bb..07cb8f7030b67 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -108,7 +108,6 @@ enum cpuhp_state { CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, - CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2a..34e43cddc520f 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -788,12 +788,4 @@ size_t kmalloc_size_roundup(size_t size); void __init kmem_cache_init_late(void); -#if defined(CONFIG_SMP) && defined(CONFIG_SLAB) -int slab_prepare_cpu(unsigned 
int cpu); -int slab_dead_cpu(unsigned int cpu); -#else -#define slab_prepare_cpu NULL -#define slab_dead_cpu NULL -#endif - #endif /* _LINUX_SLAB_H */ -- cgit v1.2.3 From a9e0b9f27266d46ed6e73aac8d0844602cd0cb93 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 17:43:38 +0200 Subject: mm/slab: remove CONFIG_SLAB code from slab common code In slab_common.c and slab.h headers, we can now remove all code behind CONFIG_SLAB and CONFIG_DEBUG_SLAB ifdefs, and remove all CONFIG_SLUB ifdefs. Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 34e43cddc520f..b2015d0e01ad8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -24,7 +24,7 @@ /* * Flags to pass to kmem_cache_create(). - * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. + * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) @@ -302,25 +302,15 @@ static inline unsigned int arch_slab_minalign(void) * Kmalloc array related definitions */ -#ifdef CONFIG_SLAB /* - * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
*/ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW -#define KMALLOC_SHIFT_LOW 5 -#endif -#endif - -#ifdef CONFIG_SLUB -#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) -#ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -#endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) -- cgit v1.2.3 From 6ac805d13870925c787a28e3fe5cc73610cacd03 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Dec 2023 10:47:49 -0700 Subject: iov_iter: remove unused 'iov' argument from import_single_range() It is entirely unused, just get rid of it. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20231204174827.1258875-2-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index b6214cbf2a43f..bfafd3542fa76 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -348,7 +348,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, - struct iovec *iov, struct iov_iter *i); + struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, -- cgit v1.2.3 From 9fd7874c0e5c89d7da0b4442271696ec0f8edcba Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 4 Dec 2023 10:47:50 -0700 Subject: iov_iter: replace import_single_range() with import_ubuf() With the removal of the 'iov' argument to import_single_range(), the two functions are now fully identical. Convert the import_single_range() callers to import_ubuf(), and remove the former fully. 
Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/20231204174827.1258875-3-axboe@kernel.dk Signed-off-by: Christian Brauner --- include/linux/uio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index bfafd3542fa76..bea9c89922d90 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -347,8 +347,6 @@ ssize_t import_iovec(int type, const struct iovec __user *uvec, ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); -int import_single_range(int type, void __user *buf, size_t len, - struct iov_iter *i); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, -- cgit v1.2.3 From 118eb89b1e7f6807776c012cffc5c9b07fd26164 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 15 Nov 2023 14:58:05 +0530 Subject: drivers: perf: arm_pmu: Drop 'pmu_lock' element from 'struct pmu_hw_events' As 'pmu_lock' element is not being used in any ARM PMU implementation, just drop this from 'struct pmu_hw_events'. Cc: Will Deacon Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20231115092805.737822-3-anshuman.khandual@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 143fbc10ecfe0..e2503d48ddee6 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -59,12 +59,6 @@ struct pmu_hw_events { */ DECLARE_BITMAP(used_mask, ARMPMU_MAX_HWEVENTS); - /* - * Hardware lock to serialize accesses to PMU registers. Needed for the - * read/modify/write sequences. 
- */ - raw_spinlock_t pmu_lock; - /* * When using percpu IRQs, we need a percpu dev_id. Place it here as we * already have to allocate this struct per cpu. -- cgit v1.2.3 From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:39 -0800 Subject: bpf: support non-r10 register spill/fill to/from stack in precision tracking Use instruction (jump) history to record instructions that performed register spill/fill to/from stack, regardless if this was done through read-only r10 register, or any other register after copying r10 into it *and* potentially adjusting offset. To make this work reliably, we push extra per-instruction flags into instruction history, encoding stack slot index (spi) and stack frame number in extra 10 bit flags we take away from prev_idx in instruction history. We don't touch idx field for maximum performance, as it's checked most frequently during backtracking. This change removes basically the last remaining practical limitation of precision backtracking logic in BPF verifier. It fixes known deficiencies, but also opens up new opportunities to reduce number of verified states, explored in the subsequent patches. There are only three differences in selftests' BPF object files according to veristat, all in the positive direction (less states). File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- ------------- --------- --------- ------------- ---------- ---------- ------------- test_cls_redirect_dynptr.bpf.linked3.o cls_redirect 2987 2864 -123 (-4.12%) 240 231 -9 (-3.75%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82848 82661 -187 (-0.23%) 5107 5073 -34 (-0.67%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 85116 84964 -152 (-0.18%) 5162 5130 -32 (-0.62%) Note, I avoided renaming jmp_history to more generic insn_hist to minimize number of lines changed and potential merge conflicts between bpf and bpf-next trees. 
Notice also cur_hist_entry pointer reset to NULL at the beginning of instruction verification loop. This pointer avoids the problem of relying on last jump history entry's insn_idx to determine whether we already have entry for current instruction or not. It can happen that we added jump history entry because current instruction is_jmp_point(), but also we need to add instruction flags for stack access. In this case, we don't want two entries, so we need to reuse last added entry, if it is present. Relying on insn_idx comparison has the same ambiguity problem as the one that was fixed recently in [0], so we avoid that. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/ Acked-by: Eduard Zingerman Reported-by: Tao Lyu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3378cc753061e..bada59812e003 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -325,12 +325,34 @@ struct bpf_func_state { int allocated_stack; }; -struct bpf_idx_pair { - u32 prev_idx; +#define MAX_CALL_FRAMES 8 + +/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +enum { + /* instruction references stack slot through PTR_TO_STACK register; + * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) + * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, + * 8 bytes per slot, so slot index (spi) is [0, 63]) + */ + INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ + + INSN_F_SPI_MASK = 0x3f, /* 6 bits */ + INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + + INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ +}; + +static_assert(INSN_F_FRAMENO_MASK + 1 
>= MAX_CALL_FRAMES); +static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); + +struct bpf_jmp_history_entry { u32 idx; + /* insn idx can't be bigger than 1 million */ + u32 prev_idx : 22; + /* special flags, e.g., whether insn is doing register stack spill/load */ + u32 flags : 10; }; -#define MAX_CALL_FRAMES 8 /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { @@ -413,7 +435,7 @@ struct bpf_verifier_state { * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ - struct bpf_idx_pair *jmp_history; + struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; @@ -656,6 +678,7 @@ struct bpf_verifier_env { int cur_stack; } cfg; struct backtrack_state bt; + struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ -- cgit v1.2.3 From 0949dd96dffec39683c6066cf8d0877cebc321ec Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 21 Nov 2023 13:44:25 +0000 Subject: drivers: base: Allow parts of GENERIC_CPU_DEVICES to be overridden Architectures often have extra per-cpu work that needs doing before a CPU is registered, often to determine if a CPU is hotpluggable. To allow the ACPI architectures to use GENERIC_CPU_DEVICES, move the cpu_register() call into arch_register_cpu(), which is made __weak so architectures with extra work can override it. This aligns with the way x86, ia64 and loongarch register hotplug CPUs when they become present. 
Signed-off-by: James Morse Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Signed-off-by: "Russell King (Oracle)" Reviewed-by: Jonathan Cameron Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/E1r5R3B-00Csz6-Uh@rmk-PC.armlinux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index fc8094419084f..1e982d63eae8c 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -88,6 +88,10 @@ extern ssize_t arch_cpu_probe(const char *, size_t); extern ssize_t arch_cpu_release(const char *, size_t); #endif +#ifdef CONFIG_GENERIC_CPU_DEVICES +DECLARE_PER_CPU(struct cpu, cpu_devices); +#endif + /* * These states are not related to the core CPU hotplug mechanism. They are * used by various (sub)architectures to track internal state -- cgit v1.2.3 From bb5e44fb3be685ecb3feb120aca4269a92cc84cf Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 21 Nov 2023 13:44:36 +0000 Subject: drivers: base: add arch_cpu_is_hotpluggable() The differences between architecture specific implementations of arch_register_cpu() are down to whether the CPU is hotpluggable or not. Rather than overriding the weak version of arch_register_cpu(), provide a function that can be used to provide this detail instead. 
Reviewed-by: Shaoqin Huang Signed-off-by: "Russell King (Oracle)" Reviewed-by: Jonathan Cameron Reviewed-by: Gavin Shan Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/E1r5R3M-00CszH-6r@rmk-PC.armlinux.org.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 1e982d63eae8c..dcb89c9871640 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -80,6 +80,7 @@ extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); +extern bool arch_cpu_is_hotpluggable(int cpu); extern int arch_register_cpu(int cpu); extern void arch_unregister_cpu(int cpu); #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3 From 43a71cd66b9c0a4af3d15d8644359fde35bdbed0 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Mon, 4 Dec 2023 20:12:30 +0000 Subject: net-device: reorganize net_device fast path variables Reorganize fast path variables on tx-txrx-rx order Fastpath variables end after npinfo. Below data generated with pahole on x86 architecture. 
Fast path variables span cache lines before change: 12 Fast path variables span cache lines after change: 4 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231204201232.520025-2-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 117 +++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5551177e024e0..cb96aad6a6ee3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2097,6 +2097,70 @@ enum netdev_stat_type { */ struct net_device { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/net_device.rst. + * Please update the document when adding new fields. + */ + + /* TX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_tx); + unsigned long long priv_flags; + const struct net_device_ops *netdev_ops; + const struct header_ops *header_ops; + struct netdev_queue *_tx; + unsigned int real_num_tx_queues; + unsigned int gso_max_size; + unsigned int gso_ipv4_max_size; + u16 gso_max_segs; + s16 num_tc; + /* Note : dev->mtu is often read without holding a lock. + * Writers usually hold RTNL. + * It is recommended to use READ_ONCE() to annotate the reads, + * and to use WRITE_ONCE() to annotate the writes. 
+ */ + unsigned int mtu; + unsigned short needed_headroom; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; +#ifdef CONFIG_XPS + struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; +#endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_egress; +#endif + __cacheline_group_end(net_device_read_tx); + + /* TXRX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_txrx); + unsigned int flags; + unsigned short hard_header_len; + netdev_features_t features; + struct inet6_dev __rcu *ip6_ptr; + __cacheline_group_end(net_device_read_txrx); + + /* RX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_rx); + struct list_head ptype_specific; + int ifindex; + unsigned int real_num_rx_queues; + struct netdev_rx_queue *_rx; + unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; + unsigned int gro_max_size; + unsigned int gro_ipv4_max_size; + rx_handler_func_t __rcu *rx_handler; + void __rcu *rx_handler_data; + possible_net_t nd_net; +#ifdef CONFIG_NETPOLL + struct netpoll_info __rcu *npinfo; +#endif +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_ingress; +#endif + __cacheline_group_end(net_device_read_rx); + char name[IFNAMSIZ]; struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; @@ -2121,7 +2185,6 @@ struct net_device { struct list_head unreg_list; struct list_head close_list; struct list_head ptype_all; - struct list_head ptype_specific; struct { struct list_head upper; @@ -2129,26 +2192,13 @@ struct net_device { } adj_list; /* Read-mostly cache-line for fast-path access */ - unsigned int flags; xdp_features_t xdp_features; - unsigned long long priv_flags; - const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; - int ifindex; unsigned short gflags; - unsigned short hard_header_len; - /* Note : dev->mtu is often read 
without holding a lock. - * Writers usually hold RTNL. - * It is recommended to use READ_ONCE() to annotate the reads, - * and to use WRITE_ONCE() to annotate the writes. - */ - unsigned int mtu; - unsigned short needed_headroom; unsigned short needed_tailroom; - netdev_features_t features; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; @@ -2192,8 +2242,6 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif - const struct header_ops *header_ops; - unsigned char operstate; unsigned char link_mode; @@ -2234,9 +2282,7 @@ struct net_device { /* Protocol-specific pointers */ - struct in_device __rcu *ip_ptr; - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2271,26 +2317,14 @@ struct net_device { /* Interface address info used in eth_type_trans() */ const unsigned char *dev_addr; - struct netdev_rx_queue *_rx; unsigned int num_rx_queues; - unsigned int real_num_rx_queues; - struct bpf_prog __rcu *xdp_prog; - unsigned long gro_flush_timeout; - int napi_defer_hard_irqs; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. 
*/ #define GRO_MAX_SIZE (8 * 65535u) - unsigned int gro_max_size; - unsigned int gro_ipv4_max_size; unsigned int xdp_zc_max_segs; - rx_handler_func_t __rcu *rx_handler; - void __rcu *rx_handler_data; -#ifdef CONFIG_NET_XGRESS - struct bpf_mprog_entry __rcu *tcx_ingress; -#endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS struct nf_hook_entries __rcu *nf_hooks_ingress; @@ -2305,25 +2339,13 @@ struct net_device { /* * Cache lines mostly used on transmit path */ - struct netdev_queue *_tx ____cacheline_aligned_in_smp; unsigned int num_tx_queues; - unsigned int real_num_tx_queues; struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; struct xdp_dev_bulk_queue __percpu *xdp_bulkq; -#ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; -#endif -#ifdef CONFIG_NET_XGRESS - struct bpf_mprog_entry __rcu *tcx_egress; -#endif -#ifdef CONFIG_NETFILTER_EGRESS - struct nf_hook_entries __rcu *nf_hooks_egress; -#endif - #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif @@ -2362,12 +2384,6 @@ struct net_device { bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); -#ifdef CONFIG_NETPOLL - struct netpoll_info __rcu *npinfo; -#endif - - possible_net_t nd_net; - /* mid-layer private */ void *ml_priv; enum netdev_ml_priv_type ml_priv_type; @@ -2402,20 +2418,15 @@ struct net_device { */ #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) - unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; - u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; - unsigned int gso_ipv4_max_size; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - s16 num_tc; - struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; #if IS_ENABLED(CONFIG_FCOE) -- cgit v1.2.3 From d5fed5addb2b6bc13035de4338b7ea2052a2e006 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Mon, 4 Dec 2023 20:12:31 +0000 Subject: tcp: 
reorganize tcp_sock fast path variables The variables are organized in the following way: - TX read-mostly hotpath cache lines - TXRX read-mostly hotpath cache lines - RX read-mostly hotpath cache lines - TX read-write hotpath cache line - TXRX read-write hotpath cache line - RX read-write hotpath cache line Fastpath cachelines end after rcvq_space. Cache line boundaries are enforced only between read-mostly and read-write. That is, if read-mostly tx cachelines bleed into read-mostly txrx cachelines, we do not care. We care about the boundaries between read and write cachelines because we want to prevent false sharing. Fast path variables span cache lines before change: 12 Fast path variables span cache lines after change: 8 Suggested-by: Eric Dumazet Reviewed-by: Wei Wang Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231204201232.520025-3-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 248 ++++++++++++++++++++++++++++------------------------ 1 file changed, 134 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 68f3d315d2e18..f55ec155f5b71 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -194,23 +194,121 @@ static inline bool tcp_rsk_used_ao(const struct request_sock *req) #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/tcp_sock.rst. + * Please update the document when adding new fields.
+ */ + /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; - u16 tcp_header_len; /* Bytes of tcp header to send */ + + /* TX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_tx); + /* timestamp of last sent data packet (for restart window) */ + u32 max_window; /* Maximal window ever seen from peer */ + u32 rcv_ssthresh; /* Current window clamp */ + u32 reordering; /* Packet reordering metric. */ + u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ + /* from STCP, retrans queue hinting */ + struct sk_buff *lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + __cacheline_group_end(tcp_sock_read_tx); + + /* TXRX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_txrx); + u32 tsoffset; /* timestamp offset */ + u32 snd_wnd; /* The window we expect to receive */ + u32 mss_cache; /* Cached effective mss, not including SACKS */ + u32 snd_cwnd; /* Sending congestion window */ + u32 prr_out; /* Total number of pkts sent during Recovery. */ + u32 lost_out; /* Lost packets */ + u32 sacked_out; /* SACK'd packets */ + u16 tcp_header_len; /* Bytes of tcp header to send */ + u8 chrono_type : 2, /* current chronograph type */ + repair : 1, + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? 
*/ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_rx); + u32 copied_seq; /* Head of yet unread data */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ + u32 snd_wl1; /* Sequence for window update */ + u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 retrans_out; /* Retransmitted packets out */ + u16 advmss; /* Advertised MSS */ + u16 urg_data; /* Saved octet of OOB data and control flags */ + u32 lost; /* Total data packets lost incl. rexmits */ + struct minmax rtt_min; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; /* Slow start size threshold */ + __cacheline_group_end(tcp_sock_read_rx); + /* TX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned; + u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. 
+ */ + u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + u32 pushed_seq; /* Last pushed seq, required to talk to windows */ + u32 lsndtime; + u32 mdev_us; /* medium deviation */ + u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ + u64 tcp_mstamp; /* most recent packet received/sent */ + u32 rtt_seq; /* sequence number to update rttvar */ + struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ + struct sk_buff *highest_sack; /* skb just after the highest + * skb with SACKed bit set + * (validity guaranteed only if + * sacked_out > 0) + */ + u8 ecn_flags; /* ECN status bits. */ + __cacheline_group_end(tcp_sock_write_tx); + + /* TXRX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_txrx); /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; - + u32 rcv_nxt; /* What we want to receive next */ + u32 snd_nxt; /* Next sequence we send */ + u32 snd_una; /* First byte we want an ack for */ + u32 window_clamp; /* Maximal window to advertise */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 packets_out; /* Packets which are "in flight" */ + u32 snd_up; /* Urgent pointer */ + u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 app_limited; /* limited until "delivered" reaches this val */ + u32 rcv_wnd; /* Current receiver window */ /* - * RFC793 variables by their proper names. This means you can - * read the code and the spec side by side (and laugh ...) - * See RFC793 and RFC1122. The RFC writes these in capitals. 
+ * Options received (usually on last packet, some only on SYN packets). */ - u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_rx); + u64 bytes_received; + /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ @@ -220,45 +318,44 @@ struct tcp_sock { u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ - u32 rcv_nxt; /* What we want to receive next */ - u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ - u32 snd_nxt; /* Next sequence we send */ - u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut - * The total number of segments sent. - */ - u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut - * total number of data segments sent. - */ - u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut - * total number of data bytes sent. - */ + u32 max_packets_out; /* max packets_out in last window */ + u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ + u32 rcv_rtt_last_tsecr; + u64 first_tx_mstamp; /* start of window send phase */ + u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ + struct { + u32 rtt_us; + u32 seq; + u64 time; + } rcv_rtt_est; +/* Receiver queue space */ + struct { + u32 space; + u32 seq; + u64 time; + } rcvq_space; + __cacheline_group_end(tcp_sock_write_rx); + /* End of Hot Path */ + +/* + * RFC793 variables by their proper names. 
This means you can + * read the code and the spec side by side (and laugh ...) + * See RFC793 and RFC1122. The RFC writes these in capitals. + */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ - u32 snd_una; /* First byte we want an ack for */ - u32 snd_sml; /* Last byte of the most recently transmitted small packet */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ - u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; - - u32 tsoffset; /* timestamp offset */ - struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ - - u32 snd_wl1; /* Sequence for window update */ - u32 snd_wnd; /* The window we expect to receive */ - u32 max_window; /* Maximal window ever seen from peer */ - u32 mss_cache; /* Cached effective mss, not including SACKS */ - u32 window_clamp; /* Maximal window to advertise */ - u32 rcv_ssthresh; /* Current window clamp */ u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { @@ -272,24 +369,16 @@ struct tcp_sock { dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; - u16 advmss; /* Advertised MSS */ u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ tcp_usec_ts:1, /* TSval values in usec */ unused:4; - u32 chrono_start; /* Start time in jiffies of a TCP chrono */ - u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ - u8 chrono_type:2, /* current chronograph type */ - rate_app_limited:1, /* rate_{delivered,interval_us} limited? 
*/ + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ - is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - fastopen_client_fail:2; /* reason why fastopen failed */ - u8 nonagle : 4,/* Disable Nagle algorithm? */ - thin_lto : 1,/* Use linear timeouts for thin streams */ - recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ - repair : 1, + fastopen_client_fail:2, /* reason why fastopen failed */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ @@ -297,45 +386,19 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? 
*/ - u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ - u64 tcp_wstamp_ns; /* departure time for next sent data packet */ - u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ /* RTT measurement */ - u64 tcp_mstamp; /* most recent packet received/sent */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rtt_seq; /* sequence number to update rttvar */ - struct minmax rtt_min; - u32 packets_out; /* Packets which are "in flight" */ - u32 retrans_out; /* Retransmitted packets out */ - u32 max_packets_out; /* max packets_out in last window */ - u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ - - u16 urg_data; /* Saved octet of OOB data and control flags */ - u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u32 reordering; /* Packet reordering metric. */ u32 reord_seen; /* number of data packet reordering events */ - u32 snd_up; /* Urgent pointer */ - -/* - * Options received (usually on last packet, some only on SYN packets). - */ - struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ - u32 snd_ssthresh; /* Slow start size threshold */ - u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; @@ -343,32 +406,10 @@ struct tcp_sock { u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ - u32 prr_out; /* Total number of pkts sent during Recovery. */ - u32 delivered; /* Total data packets delivered incl. 
rexmits */ - u32 delivered_ce; /* Like the above but only ECE marked packets */ - u32 lost; /* Total data packets lost incl. rexmits */ - u32 app_limited; /* limited until "delivered" reaches this val */ - u64 first_tx_mstamp; /* start of window send phase */ - u64 delivered_mstamp; /* time we reached "delivered" */ - u32 rate_delivered; /* saved rate sample: packets delivered */ - u32 rate_interval_us; /* saved rate sample: time elapsed */ - - u32 rcv_wnd; /* Current receiver window */ - u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ - u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ - u32 pushed_seq; /* Last pushed seq, required to talk to windows */ - u32 lost_out; /* Lost packets */ - u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; - /* from STCP, retrans queue hinting */ - struct sk_buff* lost_skb_hint; - struct sk_buff *retransmit_skb_hint; - - /* OOO segments go in this rbtree. Socket lock must be held. */ - struct rb_root out_of_order_queue; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ @@ -377,12 +418,6 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; - struct sk_buff *highest_sack; /* skb just after the highest - * skb with SACKed bit set - * (validity guaranteed only if - * sacked_out > 0) - */ - int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ @@ -433,21 +468,6 @@ struct tcp_sock { u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ -/* Receiver side RTT estimation */ - u32 rcv_rtt_last_tsecr; - struct { - u32 rtt_us; - u32 seq; - u64 time; - } rcv_rtt_est; - -/* Receiver queue space */ - struct { - u32 space; - u32 seq; - u64 time; - } rcvq_space; - /* TCP-specific MTU probe information. 
*/ struct { u32 probe_seq_start; -- cgit v1.2.3 From facd15dfd69122042502d99ab8c9f888b48ee994 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Dec 2023 21:47:07 +0100 Subject: net: core: synchronize link-watch when carrier is queried There are multiple ways to query for the carrier state: through rtnetlink, sysfs, and (possibly) ethtool. Synchronize linkwatch work before these operations so that we don't have a situation where userspace queries the carrier state between the driver's carrier off->on transition and linkwatch running and expects it to work, when really (at least) TX cannot work until linkwatch has run. I previously posted a longer explanation of how this applies to wireless [1] but with this wireless can simply query the state before sending data, to ensure the kernel is ready for it. [1] https://lore.kernel.org/all/346b21d87c69f817ea3c37caceb34f1f56255884.camel@sipsolutions.net/ Signed-off-by: Johannes Berg Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231204214706.303c62768415.I1caedccae72ee5a45c9085c5eb49c145ce1c0dd5@changeid Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb96aad6a6ee3..1b935ee341b42 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4229,6 +4229,15 @@ static inline void netdev_ref_replace(struct net_device *odev, */ void linkwatch_fire_event(struct net_device *dev); +/** + * linkwatch_sync_dev - sync linkwatch for the given device + * @dev: network device to sync linkwatch for + * + * Sync linkwatch for the given device, removing it from the + * pending work list (if queued). 
+ */ +void linkwatch_sync_dev(struct net_device *dev); + /** * netif_carrier_ok - test if carrier present * @dev: network device -- cgit v1.2.3 From 7037d95a047cd89b1f680eed253c6ab586bef1ed Mon Sep 17 00:00:00 2001 From: Kelly Kane Date: Sat, 2 Dec 2023 17:17:12 -0800 Subject: r8152: add vendor/device ID pair for ASUS USB-C2500 The ASUS USB-C2500 is an RTL8156 based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. Signed-off-by: Kelly Kane Link: https://lore.kernel.org/r/20231203011712.6314-1-kelly@hawknetworks.com Signed-off-by: Paolo Abeni --- include/linux/usb/r8152.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 287e9d83fb8bc..33a4c146dc19c 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -30,6 +30,7 @@ #define VENDOR_ID_NVIDIA 0x0955 #define VENDOR_ID_TPLINK 0x2357 #define VENDOR_ID_DLINK 0x2001 +#define VENDOR_ID_ASUS 0x0b05 #if IS_REACHABLE(CONFIG_USB_RTL8152) extern u8 rtl8152_get_version(struct usb_interface *intf); -- cgit v1.2.3 From a3a44d2d3a5c5ff6e73c711db5b1911b5a676bb0 Mon Sep 17 00:00:00 2001 From: Even Xu Date: Tue, 5 Dec 2023 09:50:30 +0800 Subject: HID: Intel-ish-hid: Ishtp: Add helper functions for client connection For every ishtp client driver during initialization state, the flow is: 1 - Allocate an ISHTP client instance 2 - Reserve a host id and link the client instance 3 - Search a firmware client using UUID and get related client information 4 - Bind firmware client id to the ISHTP client instance 5 - Set the state of the ISHTP client instance to CONNECTING 6 - Send connect request to firmware 7 - Register event callback for messages from the firmware During deinitialization state, the flow is: 9 - Set the state of the ISHTP client instance to ISHTP_CL_DISCONNECTING 10 - Issue disconnect request to firmware 11 - Unlink the client instance 12 - Flush message
queue 13 - Free ISHTP client instance Step 2-7 are identical to the steps of client driver initialization and driver reset flow, but reallocation of the RX/TX ring buffers can be avoided in reset flow. Also for step 9-12, they are identical to the steps of client driver failure handling after connect request, driver reset flow and driver removing. So, add two helper functions to simplify client driver code. ishtp_cl_establish_connection() ishtp_cl_destroy_connection() No functional changes are expected. Signed-off-by: Even Xu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index f45f13304addd..771622650247a 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -94,6 +94,9 @@ int ishtp_cl_link(struct ishtp_cl *cl); void ishtp_cl_unlink(struct ishtp_cl *cl); int ishtp_cl_disconnect(struct ishtp_cl *cl); int ishtp_cl_connect(struct ishtp_cl *cl); +int ishtp_cl_establish_connection(struct ishtp_cl *cl, const guid_t *uuid, + int tx_size, int rx_size, bool reset); +void ishtp_cl_destroy_connection(struct ishtp_cl *cl, bool reset); int ishtp_cl_send(struct ishtp_cl *cl, uint8_t *buf, size_t length); int ishtp_cl_flush_queues(struct ishtp_cl *cl); int ishtp_cl_io_rb_recycle(struct ishtp_cl_rb *rb); -- cgit v1.2.3 From 16a1d968358aa9e897ce995fa45cb15d55a0e83d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 2 Oct 2023 20:43:43 +0200 Subject: mm/slab: remove mm/slab.c and slab_def.h Remove the SLAB implementation. Update CREDITS. Also update and properly sort the SLOB entry there. 
RIP SLAB allocator (1996 - 2024) Reviewed-by: Kees Cook Acked-by: Christoph Lameter Acked-by: David Rientjes Tested-by: David Rientjes Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slab_def.h | 124 ----------------------------------------------- 1 file changed, 124 deletions(-) delete mode 100644 include/linux/slab_def.h (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h deleted file mode 100644 index a61e7d55d0d30..0000000000000 --- a/include/linux/slab_def.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLAB_DEF_H -#define _LINUX_SLAB_DEF_H - -#include -#include - -/* - * Definitions unique to the original Linux SLAB allocator. - */ - -struct kmem_cache { - struct array_cache __percpu *cpu_cache; - -/* 1) Cache tunables. Protected by slab_mutex */ - unsigned int batchcount; - unsigned int limit; - unsigned int shared; - - unsigned int size; - struct reciprocal_value reciprocal_buffer_size; -/* 2) touched by every alloc & free from the backend */ - - slab_flags_t flags; /* constant flags */ - unsigned int num; /* # of objs per slab */ - -/* 3) cache_grow/shrink */ - /* order of pgs per slab (2^n) */ - unsigned int gfporder; - - /* force GFP flags, e.g. 
GFP_DMA */ - gfp_t allocflags; - - size_t colour; /* cache colouring range */ - unsigned int colour_off; /* colour offset */ - unsigned int freelist_size; - - /* constructor func */ - void (*ctor)(void *obj); - -/* 4) cache creation/removal */ - const char *name; - struct list_head list; - int refcount; - int object_size; - int align; - -/* 5) statistics */ -#ifdef CONFIG_DEBUG_SLAB - unsigned long num_active; - unsigned long num_allocations; - unsigned long high_mark; - unsigned long grown; - unsigned long reaped; - unsigned long errors; - unsigned long max_freeable; - unsigned long node_allocs; - unsigned long node_frees; - unsigned long node_overflow; - atomic_t allochit; - atomic_t allocmiss; - atomic_t freehit; - atomic_t freemiss; - - /* - * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. 'size' contains the total - * object size including these internal fields, while 'obj_offset' - * and 'object_size' contain the offset to the user object and its - * size. 
- */ - int obj_offset; -#endif /* CONFIG_DEBUG_SLAB */ - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) -{ - void *object = x - (x - slab->s_mem) % cache->size; - void *last_object = slab->s_mem + (cache->num - 1) * cache->size; - - if (unlikely(object > last_object)) - return last_object; - else - return object; -} - -/* - * We want to avoid an expensive divide : (offset / cache->size) - * Using the fact that size is a constant for a particular cache, - * we can replace (offset / cache->size) by - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - if (is_kfence_address(slab_address(slab))) - return 1; - return cache->num; -} - -#endif /* _LINUX_SLAB_DEF_H */ -- cgit v1.2.3 From 7ef08ae8277c66657127844179912214c67fb4bc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 09:54:15 +0200 Subject: mm/slab: move struct kmem_cache_cpu declaration to slub.c Nothing outside SLUB itself accesses the struct kmem_cache_cpu fields so it does not need to be declared in slub_def.h. This allows also to move enum stat_item. 
Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 54 ------------------------------------------------ 1 file changed, 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index deb90cf4bffb0..a0229ea429770 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -12,60 +12,6 @@ #include #include -enum stat_item { - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ - FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ - FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ - FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ - ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failure of this_cpu_cmpxchg_double */ - CMPXCHG_DOUBLE_FAIL, /* Number of times that cmpxchg double did not match */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu 
partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ - NR_SLUB_STAT_ITEMS -}; - -#ifndef CONFIG_SLUB_TINY -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - union { - struct { - void **freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_aba_t freelist_tid; - }; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated frozen slabs */ -#endif - local_lock_t lock; /* Protects the fields above */ -#ifdef CONFIG_SLUB_STATS - unsigned stat[NR_SLUB_STAT_ITEMS]; -#endif -}; -#endif /* CONFIG_SLUB_TINY */ - #ifdef CONFIG_SLUB_CPU_PARTIAL #define slub_percpu_partial(c) ((c)->partial) -- cgit v1.2.3 From 19975f83412fbb9b1458f3dfbf16ca043a57788a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 3 Oct 2023 09:59:48 +0200 Subject: mm/slab: move the rest of slub_def.h to mm/slab.h mm/slab.h is the only place to include include/linux/slub_def.h which has allowed switching between SLAB and SLUB. Now we can simply move the contents over and remove slub_def.h. Use this opportunity to fix up some whitespace (alignment) issues. 
Reviewed-by: Kees Cook Acked-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- include/linux/slub_def.h | 150 ----------------------------------------------- 1 file changed, 150 deletions(-) delete mode 100644 include/linux/slub_def.h (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h deleted file mode 100644 index a0229ea429770..0000000000000 --- a/include/linux/slub_def.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_SLUB_DEF_H -#define _LINUX_SLUB_DEF_H - -/* - * SLUB : A Slab allocator without object queues. - * - * (C) 2007 SGI, Christoph Lameter - */ -#include -#include -#include -#include - -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - -/* - * Word size structure that can be atomically updated or read and that - * contains both the order and the number of objects that a slab of the - * given order would contain. - */ -struct kmem_cache_order_objects { - unsigned int x; -}; - -/* - * Slab cache management. - */ -struct kmem_cache { -#ifndef CONFIG_SLUB_TINY - struct kmem_cache_cpu __percpu *cpu_slab; -#endif - /* Used for retrieving partial slabs, etc. 
*/ - slab_flags_t flags; - unsigned long min_partial; - unsigned int size; /* The size of an object including metadata */ - unsigned int object_size;/* The size of an object without metadata */ - struct reciprocal_value reciprocal_size; - unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif - struct kmem_cache_order_objects oo; - - /* Allocation and freeing of slabs */ - struct kmem_cache_order_objects min; - gfp_t allocflags; /* gfp flags to use on each alloc */ - int refcount; /* Refcount for slab cache destroy */ - void (*ctor)(void *); - unsigned int inuse; /* Offset to metadata */ - unsigned int align; /* Alignment */ - unsigned int red_left_pad; /* Left redzone padding size */ - const char *name; /* Name (only for display!) */ - struct list_head list; /* List of slab caches */ -#ifdef CONFIG_SYSFS - struct kobject kobj; /* For sysfs */ -#endif -#ifdef CONFIG_SLAB_FREELIST_HARDENED - unsigned long random; -#endif - -#ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. 
- */ - unsigned int remote_node_defrag_ratio; -#endif - -#ifdef CONFIG_SLAB_FREELIST_RANDOM - unsigned int *random_seq; -#endif - -#ifdef CONFIG_KASAN_GENERIC - struct kasan_cache kasan_info; -#endif - -#ifdef CONFIG_HARDENED_USERCOPY - unsigned int useroffset; /* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ -#endif - - struct kmem_cache_node *node[MAX_NUMNODES]; -}; - -#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) -#define SLAB_SUPPORTS_SYSFS -void sysfs_slab_unlink(struct kmem_cache *); -void sysfs_slab_release(struct kmem_cache *); -#else -static inline void sysfs_slab_unlink(struct kmem_cache *s) -{ -} -static inline void sysfs_slab_release(struct kmem_cache *s) -{ -} -#endif - -void *fixup_red_left(struct kmem_cache *s, void *p); - -static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab, - void *x) { - void *object = x - (x - slab_address(slab)) % cache->size; - void *last_object = slab_address(slab) + - (slab->objects - 1) * cache->size; - void *result = (unlikely(object > last_object)) ? 
last_object : object; - - result = fixup_red_left(cache, result); - return result; -} - -/* Determine object index from a given position */ -static inline unsigned int __obj_to_index(const struct kmem_cache *cache, - void *addr, void *obj) -{ - return reciprocal_divide(kasan_reset_tag(obj) - addr, - cache->reciprocal_size); -} - -static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) -{ - if (is_kfence_address(obj)) - return 0; - return __obj_to_index(cache, slab_address(slab), obj); -} - -static inline int objs_per_slab(const struct kmem_cache *cache, - const struct slab *slab) -{ - return slab->objects; -} -#endif /* _LINUX_SLUB_DEF_H */ -- cgit v1.2.3 From 9396c4ee93f9ac03cd0cea0bb345fbc657772943 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 4 Dec 2023 19:00:44 +0000 Subject: net/tcp: Don't store TCP-AO maclen on reqsk This extra check doesn't work for a handshake when SYN segment has (current_key.maclen != rnext_key.maclen). It could be amended to preserve rnext_key.maclen instead of current_key.maclen, but that requires a lookup on listen socket. Originally, this extra maclen check was introduced just because it was cheap. Drop it and convert tcp_request_sock::maclen into boolean tcp_request_sock::used_tcp_ao. 
Fixes: 06b22ef29591 ("net/tcp: Wire TCP-AO to request sockets") Signed-off-by: Dmitry Safonov Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 68f3d315d2e18..b646b574b060d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -169,7 +169,7 @@ struct tcp_request_sock { #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; - u8 maclen; + bool used_tcp_ao; #endif }; @@ -180,14 +180,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) static inline bool tcp_rsk_used_ao(const struct request_sock *req) { - /* The real length of MAC is saved in the request socket, - * signing anything with zero-length makes no sense, so here is - * a little hack.. - */ #ifndef CONFIG_TCP_AO return false; #else - return tcp_rsk(req)->maclen != 0; + return tcp_rsk(req)->used_tcp_ao; #endif } -- cgit v1.2.3 From 16e5ac127d8d18adf85fe5ba847d77b58d1ed418 Mon Sep 17 00:00:00 2001 From: Naresh Solanki Date: Tue, 5 Dec 2023 16:22:04 +0530 Subject: regulator: event: Add regulator netlink event support This commit introduces netlink event support to the regulator subsystem. Changes: - Introduce event.c and regnl.h for netlink event handling. - Implement reg_generate_netlink_event to broadcast regulator events. - Update Makefile to include the new event.c file. 
Signed-off-by: Naresh Solanki Link: https://lore.kernel.org/r/20231205105207.1262928-1-naresh.solanki@9elements.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 47 +------------------------------------- 1 file changed, 1 insertion(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 39b666b40ea61..4660582a33022 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -33,6 +33,7 @@ #include #include +#include struct device; struct notifier_block; @@ -84,52 +85,6 @@ struct regulator_dev; #define REGULATOR_MODE_IDLE 0x4 #define REGULATOR_MODE_STANDBY 0x8 -/* - * Regulator notifier events. - * - * UNDER_VOLTAGE Regulator output is under voltage. - * OVER_CURRENT Regulator output current is too high. - * REGULATION_OUT Regulator output is out of regulation. - * FAIL Regulator output has failed. - * OVER_TEMP Regulator over temp. - * FORCE_DISABLE Regulator forcibly shut down by software. - * VOLTAGE_CHANGE Regulator voltage changed. - * Data passed is old voltage cast to (void *). - * DISABLE Regulator was disabled. - * PRE_VOLTAGE_CHANGE Regulator is about to have voltage changed. - * Data passed is "struct pre_voltage_change_data" - * ABORT_VOLTAGE_CHANGE Regulator voltage change failed for some reason. - * Data passed is old voltage cast to (void *). - * PRE_DISABLE Regulator is about to be disabled - * ABORT_DISABLE Regulator disable failed for some reason - * - * NOTE: These events can be OR'ed together when passed into handler. 
- */ - -#define REGULATOR_EVENT_UNDER_VOLTAGE 0x01 -#define REGULATOR_EVENT_OVER_CURRENT 0x02 -#define REGULATOR_EVENT_REGULATION_OUT 0x04 -#define REGULATOR_EVENT_FAIL 0x08 -#define REGULATOR_EVENT_OVER_TEMP 0x10 -#define REGULATOR_EVENT_FORCE_DISABLE 0x20 -#define REGULATOR_EVENT_VOLTAGE_CHANGE 0x40 -#define REGULATOR_EVENT_DISABLE 0x80 -#define REGULATOR_EVENT_PRE_VOLTAGE_CHANGE 0x100 -#define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 -#define REGULATOR_EVENT_PRE_DISABLE 0x400 -#define REGULATOR_EVENT_ABORT_DISABLE 0x800 -#define REGULATOR_EVENT_ENABLE 0x1000 -/* - * Following notifications should be emitted only if detected condition - * is such that the HW is likely to still be working but consumers should - * take a recovery action to prevent problems esacalating into errors. - */ -#define REGULATOR_EVENT_UNDER_VOLTAGE_WARN 0x2000 -#define REGULATOR_EVENT_OVER_CURRENT_WARN 0x4000 -#define REGULATOR_EVENT_OVER_VOLTAGE_WARN 0x8000 -#define REGULATOR_EVENT_OVER_TEMP_WARN 0x10000 -#define REGULATOR_EVENT_WARN_MASK 0x1E000 - /* * Regulator errors that can be queried using regulator_get_error_flags * -- cgit v1.2.3 From 15bece7bec0df91a8ed1c185483d67708425ca8e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 24 Nov 2023 20:16:15 +0800 Subject: cpu/hotplug: Remove unused CPU hotplug states There are unused hotplug states which either have never been used or the removal of the usage did not remove the state constant. Drop them to reduce the size of the cpuhp_hp_states array. 
Signed-off-by: Zenghui Yu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231124121615.1604-1-yuzenghui@huawei.com --- include/linux/cpuhotplug.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index efc0c0b07efb4..af6c21aab9859 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -66,15 +66,12 @@ enum cpuhp_state { CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, - CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, - /* Must be after CPUHP_MM_VMSTAT_DEAD */ - CPUHP_MM_DEMOTION_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, @@ -96,7 +93,6 @@ enum cpuhp_state { CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_IOVA_DEAD, - CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, CPUHP_AP_DTPM_CPU_DEAD, @@ -118,7 +114,6 @@ enum cpuhp_state { CPUHP_XEN_EVTCHN_PREPARE, CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_SH_SH3X_PREPARE, - CPUHP_NET_FLOW_PREPARE, CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, @@ -151,18 +146,14 @@ enum cpuhp_state { CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, - CPUHP_AP_IRQ_RISCV_STARTING, CPUHP_AP_IRQ_LOONGARCH_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, - CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, - CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, @@ -179,7 +170,6 @@ enum cpuhp_state { CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, - 
CPUHP_AP_MARCO_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, @@ -217,9 +207,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, - CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, - CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, @@ -252,8 +240,6 @@ enum cpuhp_state { CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, - /* Must be after CPUHP_AP_ONLINE_DYN for node_states[N_CPU] update */ - CPUHP_AP_MM_DEMOTION_ONLINE, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, -- cgit v1.2.3 From 57b8543ceee82ea72be1745a6dc3a9111d55a151 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:13 +0530 Subject: ACPI: bus: update acpi_dev_uid_match() to support multiple types According to the ACPI specification, a _UID object can evaluate to either a numeric value or a string. Update acpi_dev_uid_match() to support _UID matching for both integer and string types. Suggested-by: Mika Westerberg Signed-off-by: Raag Jadav [ rjw: Rename auxiliary macros, relocate kerneldoc comment ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4db54e928b36d..2abe81f074deb 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -756,6 +756,9 @@ const char *acpi_get_subsystem_id(acpi_handle handle); #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) +/* Get rid of the -Wunused-variable for adev */ +#define acpi_dev_uid_match(adev, uid2) (adev && false) + #include struct fwnode_handle; @@ -772,11 +775,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) struct acpi_device; -static inline bool acpi_dev_uid_match(struct acpi_device *adev, const char *uid2) -{ - return false; -} - static inline bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2) { -- cgit v1.2.3 From b2b32a1738815155d4a0039bb7a6092d40f23e81 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 23 Nov 2023 15:36:14 +0530 Subject: ACPI: bus: update acpi_dev_hid_uid_match() to support multiple types Now that we have _UID matching support for both integer and string types, we can support them into acpi_dev_hid_uid_match() helper as well. Signed-off-by: Raag Jadav Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2abe81f074deb..75274585656c3 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -758,6 +758,7 @@ const char *acpi_get_subsystem_id(acpi_handle handle); /* Get rid of the -Wunused-variable for adev */ #define acpi_dev_uid_match(adev, uid2) (adev && false) +#define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) #include @@ -775,12 +776,6 @@ static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) struct acpi_device; -static inline bool -acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2) -{ - return false; -} - static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; -- cgit v1.2.3 From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:14 -0800 Subject: bpf: add BPF token delegation mount options to BPF FS Add few new mount options to BPF FS that allow to specify that a given BPF FS instance allows creation of BPF token (added in the next patch), and what sort of operations are allowed under BPF token. 
As such, we get 4 new mount options, each of which is a bit mask: - `delegate_cmds` specifies which bpf() syscall commands are allowed with BPF token derived from this BPF FS instance; - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies a set of allowable BPF map types that could be created with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies a set of allowable BPF program types that could be loaded with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies a set of allowable BPF program attach types that could be loaded with BPF token; delegate_progs and delegate_attachs are meant to be used together, as full BPF program type is, in general, determined through both program type and program attach type. Currently, these mount options accept the following forms of values: - a special value "any", that enables all possible values of a given bit set; - numeric value (decimal or hexadecimal, determined by kernel automatically) that specifies a bit mask value directly; - all the values for a given mount option are combined, if specified multiple times. E.g., `mount -t bpf nodev /path/to/mount -o delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3 mask. Ideally, more convenient (for humans) symbolic form derived from corresponding UAPI enums would be accepted (e.g., `-o delegate_progs=kprobe|tracepoint`) and I intend to implement this, but it requires a bunch of UAPI header churn, so I postponed it until this feature lands upstream or at least there is a definite consensus that this feature is acceptable and is going to make it, just to minimize amount of wasted effort and not increase amount of non-essential code to be reviewed. Attentive reader will notice that BPF FS is now marked as FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init user namespace as long as the process has sufficient *namespaced* capabilities within that user namespace.
But in reality we still restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added to allow creating BPF FS context object (i.e., fsopen("bpf")) from inside unprivileged process inside non-init userns, to capture that userns as the owning userns. It will still be required to pass this context object back to privileged process to instantiate and mount it. This manipulation is important, because capturing non-init userns as the owning userns of BPF FS instance (super block) allows that userns to be used to constrain BPF token to that userns later on (see next patch). So creating BPF FS with delegation inside unprivileged userns will restrict derived BPF token objects to only "work" inside that intended userns, making it scoped to an intended "container". Also, setting these delegation options requires capable(CAP_SYS_ADMIN), so unprivileged process cannot set this up without involvement of a privileged process. There is a set of selftests at the end of the patch set that simulates this sequence of steps and validates that everything works as intended. But careful review is requested to make sure there are no missed gaps in the implementation and testing. This somewhat subtle set of aspects is the result of previous discussions ([0]) about various user namespace implications and interactions with BPF token functionality and is necessary to contain BPF token inside intended user namespace.
[0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/ Acked-by: Christian Brauner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10e5e4d8a00fa..d3c9acc593eaa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1581,6 +1581,16 @@ struct bpf_link_primer { u32 id; }; +struct bpf_mount_opts { + umode_t mode; + + /* BPF token-related delegation options */ + u64 delegate_cmds; + u64 delegate_maps; + u64 delegate_progs; + u64 delegate_attachs; +}; + struct bpf_struct_ops_value; struct btf_member; -- cgit v1.2.3 From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:15 -0800 Subject: bpf: introduce BPF token object Add new kind of BPF kernel object, BPF token. BPF token is meant to allow delegating privileged BPF functionality, like loading a BPF program or creating a BPF map, from privileged process to a *trusted* unprivileged process, all while having a good amount of control over which privileged operations could be performed using provided BPF token. This is achieved through mounting BPF FS instance with extra delegation mount options, which determine what operations are delegatable, and also constraining it to the owning user namespace (as mentioned in the previous patch). BPF token itself is just a derivative from BPF FS and can be created through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF FS FD, which can be attained through open() API by opening BPF FS mount point. Currently, BPF token "inherits" delegated command, map types, prog type, and attach type bit sets from BPF FS as is. 
In the future, having a BPF token as a separate object with its own FD, we can allow further restricting BPF token's allowable set of things either at the creation time or after the fact, allowing the process to guard itself further from unintentionally trying to load undesired kinds of BPF programs. But for now we keep things simple and just copy bit sets as is. When BPF token is created from BPF FS mount, we take reference to the BPF super block's owning user namespace, and then use that namespace for checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN} capabilities that are normally only checked against init userns (using capable()), but now we check them using ns_capable() instead (if BPF token is provided). See bpf_token_capable() for details. Such setup means that BPF token in itself is not sufficient to grant BPF functionality. User namespaced process has to *also* have necessary combination of capabilities inside that user namespace. So while previously CAP_BPF was useless when granted within user namespace, now it gains a meaning and allows container managers and sys admins to have a flexible control over which processes can and need to use BPF functionality within the user namespace (i.e., container in practice). And BPF FS delegation mount options and derived BPF tokens serve as a per-container "flag" to grant overall ability to use bpf() (plus further restrictions on which parts of bpf() syscalls are treated as namespaced). Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF) within the BPF FS owning user namespace, rounding out the ns_capable() story of BPF token.
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3c9acc593eaa..aa9cf8e5fab16 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,10 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; +struct bpf_token; +struct user_namespace; +struct super_block; +struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1591,6 +1595,13 @@ struct bpf_mount_opts { u64 delegate_attachs; }; +struct bpf_token { + struct work_struct work; + atomic64_t refcnt; + struct user_namespace *userns; + u64 allowed_cmds; +}; + struct bpf_struct_ops_value; struct btf_member; @@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; +bool bpf_token_capable(const struct bpf_token *token, int cap); + static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); @@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); +void bpf_token_inc(struct bpf_token *token); +void bpf_token_put(struct bpf_token *token); +int bpf_token_create(union bpf_attr *attr); +struct bpf_token *bpf_token_get_from_fd(u32 ufd); + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); + int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int 
bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); +struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, + umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ @@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } +static inline bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +static inline void bpf_token_inc(struct bpf_token *token) +{ +} + +static inline void bpf_token_put(struct bpf_token *token) +{ +} + +static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void __dev_flush(void) { } -- cgit v1.2.3 From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:16 -0800 Subject: bpf: add BPF token support to BPF_MAP_CREATE command Allow providing token_fd for BPF_MAP_CREATE command to allow controlled BPF map creation from unprivileged process through delegated BPF token. Wire through a set of allowed BPF map types to BPF token, derived from BPF FS at BPF token creation time. This, in combination with allowed_cmds allows to create a narrowly-focused BPF token (controlled by privileged agent) with a restrictive set of BPF maps that application can attempt to create. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa9cf8e5fab16..e08e8436df38b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1600,6 +1600,7 @@ struct bpf_token { atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; + u64 allowed_maps; }; struct bpf_struct_ops_value; @@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -- cgit v1.2.3 From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:18 -0800 Subject: bpf: add BPF token support to BPF_PROG_LOAD command Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of allowed BPF program types and attach types, derived from BPF FS at BPF token creation time. Then make sure we perform bpf_token_capable() checks everywhere where it's relevant. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e08e8436df38b..20af87b59d709 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1461,6 +1461,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1601,6 +1602,8 @@ struct bpf_token { struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; + u64 allowed_progs; + u64 allowed_attachs; }; struct bpf_struct_ops_value; @@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -- cgit v1.2.3 From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:19 -0800 Subject: bpf: take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20af87b59d709..2a3ab4f3dd8cb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } -- cgit v1.2.3 From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:20 -0800 Subject: bpf: consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++++++-------- include/linux/filter.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2a3ab4f3dd8cb..435abad3cc61e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); -static inline bool bpf_allow_ptr_leaks(void) +static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_allow_uninit_stack(void) +static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v1(void) +static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v4(void) +static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index a4953fafc8cb8..14354605ad269 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_capable()) + if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; -- cgit v1.2.3 From 
c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:21 -0800 Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks Based on upstream discussion ([0]), rework existing bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF program struct. Also, we pass bpf_attr union with all the user-provided arguments for BPF_PROG_LOAD command. This will give LSMs as much information as we can basically provide. The hook is also BPF token-aware now, and optional bpf_token struct is passed as a third argument. bpf_prog_load LSM hook is called after a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were allocated and filled out, but right before performing full-fledged BPF verification step. bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for consistency. SELinux code is adjusted to all new names, types, and signatures. Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be used by some LSMs to allocate extra security blob, but also by other LSMs to reject BPF program loading, we need to make sure that bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one *even* if the hook itself returned error. If we don't do that, we run the risk of leaking memory. This seems to be possible today when combining SELinux and BPF LSM, as one example, depending on their relative ordering. Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to sleepable LSM hooks list, as they are both executed in sleepable context. Also drop bpf_prog_load hook from untrusted, as there is no issue with refcount or anything else anymore, that originally forced us to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted"). We now trigger this hook much later and it should not be an issue anymore. 
[0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/ Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 12 +++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce5521..41ec4a7c070e2 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) +LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881c..65467eef6678c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_prog_aux; +struct bpf_token; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_alloc(struct 
bpf_prog_aux *aux); -extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token); +extern void security_bpf_prog_free(struct bpf_prog *prog); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map) static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +static inline void security_bpf_prog_free(struct bpf_prog *prog) { } #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:22 -0800 Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc hook into bpf_map_create, taking not just struct bpf_map, but also bpf_attr and bpf_token, to give a fuller context to LSMs. Unlike bpf_prog_alloc, there is no need to move the hook around, as it currently is firing right before allocating BPF map ID and FD, which seems to be a sweet spot. But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free LSM hook is called even if bpf_map_create hook returned error, as if few LSMs are combined together it could be that one LSM successfully allocated security blob for its needs, while subsequent LSM rejected BPF map creation. The former LSM would still need to free up LSM blob, so we need to ensure security_bpf_map_free() is called regardless of the outcome. 
Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 41ec4a7c070e2..adb25cc63ce3b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) diff --git a/include/linux/security.h b/include/linux/security.h index 65467eef6678c..08fd777cbe94c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2025,7 +2025,8 @@ struct bpf_token; extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_alloc(struct bpf_map *map); +extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); @@ -2047,7 +2048,8 @@ 
static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_alloc(struct bpf_map *map) +static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -- cgit v1.2.3 From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:23 -0800 Subject: bpf,lsm: add BPF token LSM hooks Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to allocate LSM security blob (we add `void *security` field to struct bpf_token for that), but also control who can instantiate BPF token. This follows existing pattern for BPF map and BPF prog. Also add security_bpf_token_allow_cmd() and security_bpf_token_capable() LSM hooks that allow LSM implementation to control and negate (if necessary) BPF token's delegation of a specific bpf_cmd and capability, respectively. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/lsm_hook_defs.h | 5 +++++ include/linux/security.h | 25 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 435abad3cc61e..7a483f6b6d5f9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1604,6 +1604,9 @@ struct bpf_token { u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; +#ifdef CONFIG_SECURITY + void *security; +#endif }; struct bpf_struct_ops_value; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index adb25cc63ce3b..3fdd00b452aca 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) 
LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) +LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, + struct path *path) +LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) +LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) +LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 08fd777cbe94c..00809d2d5c38c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); extern void security_bpf_prog_free(struct bpf_prog *prog); +extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path); +extern void security_bpf_token_free(struct bpf_token *token); +extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr * static inline void security_bpf_prog_free(struct bpf_prog *prog) { } + +static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path) +{ + return 0; +} + +static inline void security_bpf_token_free(struct bpf_token *token) +{ } + +static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + return 0; +} + +static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) +{ + 
return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 3232e7aad11e541da86bbb1fa5ea5737b30bd006 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 5 Dec 2023 17:21:14 -0500 Subject: cgroup/cpuset: Include isolated cpuset CPUs in cpu_is_isolated() check Currently, the cpu_is_isolated() function checks only the statically isolated CPUs specified via the "isolcpus" and "nohz_full" kernel command line options. This function is used by vmstat and memcg to reduce interference with isolated CPUs by not doing stat flushing or scheduling works on those CPUs. Workloads running on isolated CPUs within isolated cpuset partitions should receive the same treatment to reduce unnecessary interference. This patch introduces a new cpuset_cpu_is_isolated() function to be called by cpu_is_isolated() so that the set of dynamically created cpuset isolated CPUs will be included in the check. Assuming that testing a bit in a cpumask is atomic, no synchronization primitive is currently used to synchronize access to the cpuset's isolated_cpus mask. 
Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ++++++ include/linux/sched/isolation.h | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index d629094fac6e6..875d12598bd2d 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -77,6 +77,7 @@ extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); +extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -207,6 +208,11 @@ static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) return false; } +static inline bool cpuset_cpu_is_isolated(int cpu) +{ + return false; +} + static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index fe1a46f30d240..2b461129d1fad 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -2,6 +2,7 @@ #define _LINUX_SCHED_ISOLATION_H #include +#include #include #include @@ -67,7 +68,8 @@ static inline bool housekeeping_cpu(int cpu, enum hk_type type) static inline bool cpu_is_isolated(int cpu) { return !housekeeping_test_cpu(cpu, HK_TYPE_DOMAIN) || - !housekeeping_test_cpu(cpu, HK_TYPE_TICK); + !housekeeping_test_cpu(cpu, HK_TYPE_TICK) || + cpuset_cpu_is_isolated(cpu); } #endif /* _LINUX_SCHED_ISOLATION_H */ -- cgit v1.2.3 From be0a3600aa1ebe9d23243c91d41ab1a2d5091a9b Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 5 Dec 2023 13:24:08 +0100 Subject: thermal: sysfs: Rework the handling of trip point updates Both trip_point_temp_store() and trip_point_hyst_store() use thermal_zone_set_trip() to update a given trip point, but none of them actually needs to change more than one field in struct thermal_trip representing it. However, each of them effectively calls __thermal_zone_get_trip() twice in a row for the same trip index value, once directly and once via thermal_zone_set_trip(), which is not particularly efficient, and the way in which thermal_zone_set_trip() carries out the update is not particularly straightforward. Moreover, input processing need not be done under the thermal zone lock in any of these functions. Rework trip_point_temp_store() and trip_point_hyst_store() to address the above, move the part of thermal_zone_set_trip() that is still useful to a new function called thermal_zone_trip_updated() and drop the rest of it. While at it, make trip_point_hyst_store() reject negative hysteresis values. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Daniel Lezcano --- include/linux/thermal.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1f9ee869f9f9c..0ea99f50d57c5 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -282,10 +282,6 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, struct thermal_trip *trip); - -int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id, - const struct thermal_trip *trip); - int for_each_thermal_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); -- cgit v1.2.3 From 4b7de801606e504e69689df71475d27e35336fb3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 6 Dec 2023 09:30:40 +0100 Subject: bpf: Fix prog_array_map_poke_run map poke update Lee pointed out an issue found by syzkaller [0] hitting BUG in prog array map poke update in prog_array_map_poke_run function due to error value returned from bpf_arch_text_poke function. There's race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for with check for -EINVAL in that BUG_ON call. The problem is that in such case we won't update the tail call jump and cause imbalance for the next tail call update check which will fail with -EBUSY in bpf_arch_text_poke. I'm hitting following race during the program load: CPU 0 CPU 1 bpf_prog_load bpf_check do_misc_fixups prog_array_map_poke_track map_update_elem bpf_fd_array_map_update_elem prog_array_map_poke_run bpf_arch_text_poke returns -EINVAL bpf_prog_kallsyms_add After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. Similar race exists on the program unload.
Fixing this by moving the update to bpf_arch_poke_desc_update function which makes sure we call __bpf_arch_text_poke that skips the bpf address check. Each architecture has slightly different approach wrt looking up bpf address in bpf_arch_text_poke, so instead of splitting the function or adding new 'checkip' argument in previous version, it seems best to move the whole map_poke_run update as arch specific code. [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: syzbot+97a4fe20470e9bc30810@syzkaller.appspotmail.com Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Cc: Lee Jones Cc: Maciej Fijalkowski Link: https://lore.kernel.org/bpf/20231206083041.1306660-2-jolsa@kernel.org --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6762dac3ef761..cff5bb08820ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3175,6 +3175,9 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old); + void *bpf_arch_text_copy(void *dst, void *src, size_t len); int bpf_arch_text_invalidate(void *dst, size_t len); -- cgit v1.2.3 From 187da0f8250aa94bd96266096aef6f694e0b4cd2 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 Nov 2023 17:20:33 -0800 Subject: hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write The routine __vma_private_lock tests for the existence of a reserve map associated with a private hugetlb mapping. A pointer to the reserve map is in vma->vm_private_data. __vma_private_lock was checking the pointer for NULL. However, it is possible that the low bits of the pointer could be used as flags. 
In such instances, vm_private_data is not NULL and not a valid pointer. This results in the null-ptr-deref reported by syzbot: general protection fault, probably for non-canonical address 0xdffffc000000001d: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef] CPU: 0 PID: 5048 Comm: syz-executor139 Not tainted 6.6.0-rc7-syzkaller-00142-g88 8cf78c29e2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 1 0/09/2023 RIP: 0010:__lock_acquire+0x109/0x5de0 kernel/locking/lockdep.c:5004 ... Call Trace: lock_acquire kernel/locking/lockdep.c:5753 [inline] lock_acquire+0x1ae/0x510 kernel/locking/lockdep.c:5718 down_write+0x93/0x200 kernel/locking/rwsem.c:1573 hugetlb_vma_lock_write mm/hugetlb.c:300 [inline] hugetlb_vma_lock_write+0xae/0x100 mm/hugetlb.c:291 __hugetlb_zap_begin+0x1e9/0x2b0 mm/hugetlb.c:5447 hugetlb_zap_begin include/linux/hugetlb.h:258 [inline] unmap_vmas+0x2f4/0x470 mm/memory.c:1733 exit_mmap+0x1ad/0xa60 mm/mmap.c:3230 __mmput+0x12a/0x4d0 kernel/fork.c:1349 mmput+0x62/0x70 kernel/fork.c:1371 exit_mm kernel/exit.c:567 [inline] do_exit+0x9ad/0x2a20 kernel/exit.c:861 __do_sys_exit kernel/exit.c:991 [inline] __se_sys_exit kernel/exit.c:989 [inline] __x64_sys_exit+0x42/0x50 kernel/exit.c:989 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Mask off low bit flags before checking for NULL pointer. In addition, the reserve map only 'belongs' to the OWNER (parent in parent/child relationships) so also check for the OWNER flag. 
Link: https://lkml.kernel.org/r/20231114012033.259600-1-mike.kravetz@oracle.com Reported-by: syzbot+6ada951e7c0f7bc8a71e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/00000000000078d1e00608d7878b@google.com/ Fixes: bf4916922c60 ("hugetlbfs: extend hugetlb_vma_lock to private VMAs") Signed-off-by: Mike Kravetz Reviewed-by: Rik van Riel Cc: Edward Adam Davis Cc: Muchun Song Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d3acecc5db4b3..236ec7b63c541 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1268,10 +1268,7 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } -static inline bool __vma_private_lock(struct vm_area_struct *vma) -{ - return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; -} +bool __vma_private_lock(struct vm_area_struct *vma); /* * Safe version of huge_pte_offset() to check the locks. See comments -- cgit v1.2.3 From 8e92157d7f6190c86bfd6144a409001469827100 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 Nov 2023 19:44:03 +0200 Subject: units: add missing header BITS_PER_BYTE is defined in bits.h. 
Link: https://lkml.kernel.org/r/20231128174404.393393-1-andriy.shevchenko@linux.intel.com Fixes: e8eed5f7366f ("units: Add BYTES_PER_*BIT") Signed-off-by: Andy Shevchenko Reviewed-by: Randy Dunlap Cc: Damian Muszynski Cc: Rasmus Villemoes Cc: Herbert Xu Signed-off-by: Andrew Morton --- include/linux/units.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h index ff1bd6b5f5b37..45110daaf8d32 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -2,6 +2,7 @@ #ifndef _LINUX_UNITS_H #define _LINUX_UNITS_H +#include <linux/bits.h> #include <linux/math.h> /* Metric prefixes in accordance with Système international (d'unités) */ -- cgit v1.2.3 From 73424d00dc63ba681856e06cfb0a5abbdb62e2b5 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 30 Nov 2023 11:40:18 +0800 Subject: highmem: fix a memory copy problem in memcpy_from_folio Clang static checker complains that value stored to 'from' is never read. And memcpy_from_folio() only copy the last chunk memory from folio to destination. Use 'to += chunk' to replace 'from += chunk' to fix this typo problem.
Link: https://lkml.kernel.org/r/20231130034017.1210429-1-suhui@nfschina.com Fixes: b23d03ef7af5 ("highmem: add memcpy_to_folio() and memcpy_from_folio()") Signed-off-by: Su Hui Reviewed-by: Matthew Wilcox (Oracle) Cc: Ira Weiny Cc: Jiaqi Yan Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Peter Collingbourne Cc: Tom Rix Cc: Tony Luck Cc: Signed-off-by: Andrew Morton --- include/linux/highmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 4cacc0e43b513..be20cff4ba737 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -454,7 +454,7 @@ static inline void memcpy_from_folio(char *to, struct folio *folio, memcpy(to, from, chunk); kunmap_local(from); - from += chunk; + to += chunk; offset += chunk; len -= chunk; } while (len > 0); -- cgit v1.2.3 From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:48 -0800 Subject: bpf: Let bpf_prog_pack_free handle any pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, bpf_prog_pack_free can only free a pointer to struct bpf_binary_header, which is not flexible. Add a size argument to bpf_prog_pack_free so that it can handle any pointer.
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Reviewed-by: Björn Töpel Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 14354605ad269..12d907f17d364 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1067,7 +1067,7 @@ struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_prog_pack_free(struct bpf_binary_header *hdr); +void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { -- cgit v1.2.3 From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:49 -0800 Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline() We are using "im" for "struct bpf_tramp_image" and "tr" for "struct bpf_trampoline" in most of the code base. The only exception is the prototype and fallback version of arch_prepare_bpf_trampoline(). Update them to match the rest of the code base. We mix "orig_call" and "func_addr" for the argument in different versions of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match. 
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a483f6b6d5f9..17eb6d905204a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx; * fexit = a set of program to run after original function */ struct bpf_tramp_image; -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call); + void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, -- cgit v1.2.3 From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:50 -0800 Subject: bpf: Add helpers for trampoline image management As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec() to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for different archs during the transition. Add the following helpers for this transition: void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); The fallback version of these helpers require size <= PAGE_SIZE, but they are only called with size == PAGE_SIZE. They will be called with size < PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later. 
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 17eb6d905204a..b7fca151cf1b2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); +void *arch_alloc_bpf_trampoline(unsigned int size); +void arch_free_bpf_trampoline(void *image, unsigned int size); +void arch_protect_bpf_trampoline(void *image, unsigned int size); +void arch_unprotect_bpf_trampoline(void *image, unsigned int size); + u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, -- cgit v1.2.3 From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:52 -0800 Subject: bpf: Add arch_bpf_trampoline_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used to calculate the size of the trampoline before allocating the memory. arch_prepare_bpf_trampoline() for arm64 and riscv64 can use arch_bpf_trampoline_size() to check the trampoline fits in the image. OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process twice, so it cannot use arch_bpf_trampoline_size(). 
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b7fca151cf1b2..2332ddeb396bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -- cgit v1.2.3 From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:53 -0800 Subject: bpf: Use arch_bpf_trampoline_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of blindly allocating PAGE_SIZE for each trampoline, check the size of the trampoline with arch_bpf_trampoline_size(). This size is saved in bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback arch_alloc_bpf_trampoline() still allocates a whole page because we need to use set_memory_* to protect the memory. struct_ops trampoline still uses a whole page for multiple trampolines. With this size check at caller (regular trampoline and struct_ops trampoline), remove arch_bpf_trampoline_size() from arch_prepare_bpf_trampoline() in archs. Also, update bpf_image_ksym_add() to handle symbol of different sizes. 
Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2332ddeb396bd..c1a06263a4f36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type { struct bpf_tramp_image { void *image; + int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; @@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -- cgit v1.2.3 From f922b16aa5fad7284e2b7fd7c22bab13c0e418b6 Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Wed, 29 Nov 2023 03:27:09 -0800 Subject: firmware: xilinx: Update firmware call interface to support additional args System-level platform management layer (do_fw_call()) has support for maximum of 5 arguments as of now (1 EEMI API ID + 4 command arguments). In order to support new EEMI PM_IOCTL IDs (Secure Read/Write), this support must be extended to support one additional argument, which results in a configuration of - 1 EEMI API ID + 5 command arguments. Update zynqmp_pm_invoke_fn() and do_fw_call() with this new definition containing variable arguments. 
As a result, update all the references to pm invoke function with the updated definition. Co-developed-by: Izhar Ameer Shaikh Signed-off-by: Izhar Ameer Shaikh Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20231129112713.22718-2-jay.buddhabhatti@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index d1ea3898564ca..41190bbed8edf 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -509,8 +509,7 @@ struct zynqmp_pm_query_data { u32 arg3; }; -int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, - u32 arg2, u32 arg3, u32 *ret_payload); +int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); -- cgit v1.2.3 From f689a0ca45fcdf4139727a3a02a49efbb1902306 Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Wed, 29 Nov 2023 03:27:10 -0800 Subject: firmware: xilinx: Expand feature check to support all PLM modules To support feature check for all modules, append the module id of the API that is being checked to the feature check API so it could be routed to the target module for processing. There is no need to check compatible string because the board information is taken via firmware interface. 
Co-developed-by: Saeed Nowshadi Signed-off-by: Saeed Nowshadi Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20231129112713.22718-3-jay.buddhabhatti@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 41190bbed8edf..e9a7fece5efef 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -32,6 +32,7 @@ #define PM_SIP_SVC 0xC2000000 /* PM API versions */ +#define PM_API_VERSION_1 1 #define PM_API_VERSION_2 2 #define PM_PINCTRL_PARAM_SET_VERSION 2 @@ -47,6 +48,9 @@ #define FAMILY_CODE_MASK GENMASK(27, 21) #define SUB_FAMILY_CODE_MASK GENMASK(20, 19) +#define API_ID_MASK GENMASK(7, 0) +#define MODULE_ID_MASK GENMASK(11, 8) + /* ATF only commands */ #define TF_A_PM_REGISTER_SGI 0xa04 #define PM_GET_TRUSTZONE_VERSION 0xa03 @@ -112,6 +116,12 @@ #define XPM_EVENT_ERROR_MASK_NOC_NCR BIT(13) #define XPM_EVENT_ERROR_MASK_NOC_CR BIT(12) +enum pm_module_id { + PM_MODULE_ID = 0x0, + XSEM_MODULE_ID = 0x3, + TF_A_MODULE_ID = 0xa, +}; + enum pm_api_cb_id { PM_INIT_SUSPEND_CB = 30, PM_ACKNOWLEDGE_CB = 31, @@ -119,6 +129,7 @@ enum pm_api_cb_id { }; enum pm_api_id { + PM_API_FEATURES = 0, PM_GET_API_VERSION = 1, PM_REGISTER_NOTIFIER = 5, PM_FORCE_POWERDOWN = 8, -- cgit v1.2.3 From 8c016c807a90535432543204dbbb032e4a709009 Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Wed, 29 Nov 2023 03:27:12 -0800 Subject: drivers: soc: xilinx: Fix error message on SGI registration failure Failure to register SGI for firmware event notification is a non-fatal error when the feature is not supported by other modules such as Xen and TF-A. Add an _info level log message for this special case. Also add the XST_PM_INVALID_VERSION error code and map it to the -EOPNOTSUPP Linux kernel error code. 
If the feature is not supported or there is an EEMI API version mismatch, firmware can return the XST_PM_INVALID_VERSION = 4 or XST_PM_NO_FEATURE = 19 error code. Co-developed-by: Tanmay Shah Signed-off-by: Tanmay Shah Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20231129112713.22718-5-jay.buddhabhatti@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index e9a7fece5efef..ec1800c860069 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -172,6 +172,7 @@ enum pm_api_id { /* PMU-FW return status codes */ enum pm_ret_status { XST_PM_SUCCESS = 0, + XST_PM_INVALID_VERSION = 4, XST_PM_NO_FEATURE = 19, XST_PM_INTERNAL = 2000, XST_PM_CONFLICT = 2001, -- cgit v1.2.3 From 5dac2a98f6542ae1ce78b702374ea4be3f5ee07d Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Wed, 29 Nov 2023 03:27:13 -0800 Subject: firmware: zynqmp: Add support to handle IPI CRC failure Added new PM error code XST_PM_INVALID_CRC to handle CRC validation failure during IPI communication. 
Co-developed-by: Naman Trivedi Manojbhai Signed-off-by: Naman Trivedi Manojbhai Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20231129112713.22718-6-jay.buddhabhatti@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ec1800c860069..6b48294f3c923 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -174,6 +174,7 @@ enum pm_ret_status { XST_PM_SUCCESS = 0, XST_PM_INVALID_VERSION = 4, XST_PM_NO_FEATURE = 19, + XST_PM_INVALID_CRC = 301, XST_PM_INTERNAL = 2000, XST_PM_CONFLICT = 2001, XST_PM_NO_ACCESS = 2002, -- cgit v1.2.3 From 5ec42bf04d72fd6d0a6855810cc779e0ee31dfd7 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 4 Dec 2023 15:27:06 -0600 Subject: PCI: add INTEL_HDA_ARL to pci_ids.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PCI ID insertion follows the increasing order in the table, but this hardware follows MTL (MeteorLake). 
Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Reviewed-by: Kai Vehmanen Acked-by: Mark Brown Link: https://lore.kernel.org/r/20231204212710.185976-2-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 275799b5f535c..97cc0baad0f4b 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3065,6 +3065,7 @@ #define PCI_DEVICE_ID_INTEL_82443GX_0 0x71a0 #define PCI_DEVICE_ID_INTEL_82443GX_2 0x71a2 #define PCI_DEVICE_ID_INTEL_82372FB_1 0x7601 +#define PCI_DEVICE_ID_INTEL_HDA_ARL 0x7728 #define PCI_DEVICE_ID_INTEL_HDA_RPL_S 0x7a50 #define PCI_DEVICE_ID_INTEL_HDA_ADL_S 0x7ad0 #define PCI_DEVICE_ID_INTEL_HDA_MTL 0x7e28 -- cgit v1.2.3 From fa422b353d212373fb2b2857a5ea5a6fa4876f9c Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Mon, 23 Oct 2023 15:20:46 +0800 Subject: mm, pmem, xfs: Introduce MF_MEM_PRE_REMOVE for unbind Now, if we suddenly remove a PMEM device(by calling unbind) which contains FSDAX while programs are still accessing data in this device, e.g.: ``` $FSSTRESS_PROG -d $SCRATCH_MNT -n 99999 -p 4 & # $FSX_PROG -N 1000000 -o 8192 -l 500000 $SCRATCH_MNT/t001 & echo "pfn1.1" > /sys/bus/nd/drivers/nd_pmem/unbind ``` it could come into an unacceptable state: 1. device has gone but mount point still exists, and umount will fail with "target is busy" 2. programs will hang and cannot be killed 3. may crash with NULL pointer dereference To fix this, we introduce a MF_MEM_PRE_REMOVE flag to let it know that we are going to remove the whole device, and make sure all related processes could be notified so that they could end up gracefully. This patch is inspired by Dan's "mm, dax, pmem: Introduce dev_pagemap_failure()"[1]. 
With the help of dax_holder and ->notify_failure() mechanism, the pmem driver is able to ask filesystem on it to unmap all files in use, and notify processes who are using those files. Call trace: trigger unbind -> unbind_store() -> ... (skip) -> devres_release_all() -> kill_dax() -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE) -> xfs_dax_notify_failure() `-> freeze_super() // freeze (kernel call) `-> do xfs rmap ` -> mf_dax_kill_procs() ` -> collect_procs_fsdax() // all associated processes ` -> unmap_and_kill() ` -> invalidate_inode_pages2_range() // drop file's cache `-> thaw_super() // thaw (both kernel & user call) Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove event. Use the exclusive freeze/thaw[2] to lock the filesystem to prevent new dax mapping from being created. Do not shutdown filesystem directly if configuration is not supported, or if failure range includes metadata area. Make sure all files and processes(not only the current progress) are handled correctly. Also drop the cache of associated files before pmem is removed. [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/ [2]: https://lore.kernel.org/linux-xfs/169116275623.3187159.16862410128731457358.stg-ugh@frogsfrogsfrogs/ Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Reviewed-by: Dan Williams Signed-off-by: Chandan Babu R --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece7..caf13e94260e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3904,6 +3904,7 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); -- cgit v1.2.3 From 3bc05faf37876f99e2a7baffa9c66fdcfb11d1f7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 5 Dec 2023 17:42:30 +0100 Subject: net: dsa: microchip: properly support platform_data probing The ksz driver has bits and pieces of platform_data probing support, but it doesn't work. The conventional thing to do is to have an encapsulating structure for struct dsa_chip_data that gets put into dev->platform_data. This driver expects a struct ksz_platform_data, but that doesn't contain a struct dsa_chip_data as first element, which will obviously not work with dsa_switch_probe() -> dsa_switch_parse(). Pointing dev->platform_data to a struct dsa_chip_data directly is in principle possible, but that doesn't work either. The driver has ksz_switch_detect() to read the device ID from hardware, followed by ksz_check_device_id() to compare it against a predetermined expected value. This protects against early errors in the SPI/I2C communication. With platform_data, the mechanism in ksz_check_device_id() doesn't work and even leads to NULL pointer dereferences, since of_device_get_match_data() doesn't work in that probe path. So obviously, the platform_data support is actually missing, and the existing handling of struct ksz_platform_data is bogus. 
Complete the support by adding a struct dsa_chip_data as first element, and fixing up ksz_check_device_id() to pick up the platform_data instead of the unavailable of_device_get_match_data(). The early dev->chip_id assignment from ksz_switch_register() is also bogus, because ksz_switch_detect() sets it to an initial value. So remove it. Also, ksz_platform_data :: enabled_ports isn't used anywhere, delete it. Link: https://lore.kernel.org/netdev/20231204154315.3906267-1-dd@embedd.com/ Signed-off-by: Vladimir Oltean Signed-off-by: Daniel Danzberger Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/microchip-ksz.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index ea1cc6d829e98..6480bf4af0fb8 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -20,10 +20,12 @@ #define __MICROCHIP_KSZ_H #include +#include struct ksz_platform_data { + /* Must be first such that dsa_register_switch() can access it */ + struct dsa_chip_data cd; u32 chip_id; - u16 enabled_ports; }; #endif -- cgit v1.2.3 From d16f1096b320d42e41ad9dee4d4098afd140d3e1 Mon Sep 17 00:00:00 2001 From: Daniel Danzberger Date: Tue, 5 Dec 2023 17:42:31 +0100 Subject: net: dsa: microchip: move ksz_chip_id enum to platform include With the ksz_chip_id enums moved to the platform include file for ksz switches, platform code that instantiates a device can now use these to set ksz_platform_data::chip_id. Signed-off-by: Daniel Danzberger Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/platform_data/microchip-ksz.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index 6480bf4af0fb8..f177416635a22 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -22,6 +22,25 @@ #include #include +enum ksz_chip_id { + KSZ8563_CHIP_ID = 0x8563, + KSZ8795_CHIP_ID = 0x8795, + KSZ8794_CHIP_ID = 0x8794, + KSZ8765_CHIP_ID = 0x8765, + KSZ8830_CHIP_ID = 0x8830, + KSZ9477_CHIP_ID = 0x00947700, + KSZ9896_CHIP_ID = 0x00989600, + KSZ9897_CHIP_ID = 0x00989700, + KSZ9893_CHIP_ID = 0x00989300, + KSZ9563_CHIP_ID = 0x00956300, + KSZ9567_CHIP_ID = 0x00956700, + LAN9370_CHIP_ID = 0x00937000, + LAN9371_CHIP_ID = 0x00937100, + LAN9372_CHIP_ID = 0x00937200, + LAN9373_CHIP_ID = 0x00937300, + LAN9374_CHIP_ID = 0x00937400, +}; + struct ksz_platform_data { /* Must be first such that dsa_register_switch() can access it */ struct dsa_chip_data cd; -- cgit v1.2.3 From 37c8ceb6d92c955f5dd8223c3f6c90b277322210 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Mon, 30 Oct 2023 08:22:26 +0200 Subject: mmc: core: Remove packed command leftovers Packed commands support was removed long time ago, but some bits got left behind. Remove them. 
Signed-off-by: Avri Altman Link: https://lore.kernel.org/r/20231030062226.1895692-1-avri.altman@wdc.com Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 3 --- include/linux/mmc/core.h | 1 - include/linux/mmc/mmc.h | 10 ---------- 3 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 7b12eebc5586d..47eeb122524c4 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -52,9 +52,6 @@ struct mmc_ext_csd { u8 part_config; u8 cache_ctrl; u8 rst_n_function; - u8 max_packed_writes; - u8 max_packed_reads; - u8 packed_event_en; unsigned int part_time; /* Units: ms */ unsigned int sa_timeout; /* Units: 100ns */ unsigned int generic_cmd6_time; /* Units: 10ms */ diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 6efec0b9820c1..2c7928a509071 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -27,7 +27,6 @@ struct mmc_command { u32 opcode; u32 arg; #define MMC_CMD23_ARG_REL_WR (1 << 31) -#define MMC_CMD23_ARG_PACKED ((0 << 31) | (1 << 30)) #define MMC_CMD23_ARG_TAG_REQ (1 << 29) u32 resp[4]; unsigned int flags; /* expected response type */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 6f7993803ee78..cf2bcb5da30de 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -257,8 +257,6 @@ static inline bool mmc_ready_for_data(u32 status) #define EXT_CSD_FLUSH_CACHE 32 /* W */ #define EXT_CSD_CACHE_CTRL 33 /* R/W */ #define EXT_CSD_POWER_OFF_NOTIFICATION 34 /* R/W */ -#define EXT_CSD_PACKED_FAILURE_INDEX 35 /* RO */ -#define EXT_CSD_PACKED_CMD_STATUS 36 /* RO */ #define EXT_CSD_EXP_EVENTS_STATUS 54 /* RO, 2 bytes */ #define EXT_CSD_EXP_EVENTS_CTRL 56 /* R/W, 2 bytes */ #define EXT_CSD_DATA_SECTOR_SIZE 61 /* R */ @@ -321,8 +319,6 @@ static inline bool mmc_ready_for_data(u32 status) #define EXT_CSD_SUPPORTED_MODE 493 /* RO */ #define EXT_CSD_TAG_UNIT_SIZE 498 /* RO */ #define EXT_CSD_DATA_TAG_SUPPORT 499 /* 
RO */ -#define EXT_CSD_MAX_PACKED_WRITES 500 /* RO */ -#define EXT_CSD_MAX_PACKED_READS 501 /* RO */ #define EXT_CSD_BKOPS_SUPPORT 502 /* RO */ #define EXT_CSD_HPI_FEATURES 503 /* RO */ @@ -402,18 +398,12 @@ static inline bool mmc_ready_for_data(u32 status) #define EXT_CSD_PWR_CL_8BIT_SHIFT 4 #define EXT_CSD_PWR_CL_4BIT_SHIFT 0 -#define EXT_CSD_PACKED_EVENT_EN BIT(3) - /* * EXCEPTION_EVENT_STATUS field */ #define EXT_CSD_URGENT_BKOPS BIT(0) #define EXT_CSD_DYNCAP_NEEDED BIT(1) #define EXT_CSD_SYSPOOL_EXHAUSTED BIT(2) -#define EXT_CSD_PACKED_FAILURE BIT(3) - -#define EXT_CSD_PACKED_GENERIC_ERROR BIT(0) -#define EXT_CSD_PACKED_INDEXED_ERROR BIT(1) /* * BKOPS status level -- cgit v1.2.3 From 9c0a5b3f9e55cf9a3dc85843666cae28adfdf7e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 4 Dec 2023 23:05:21 +0100 Subject: w1: gpio: Don't use platform data for driver data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit struct device's .platform_data isn't for drivers to write to. For driver-specific data there is .driver_data instead. As there is no in-tree platform that provides w1_gpio_platform_data, drop the include file and replace it by a local struct w1_gpio_ddata. 
Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/8f7ebe03ddaa5a5c6e2b36fecdf59da7fc373527.1701727212.git.u.kleine-koenig@pengutronix.de Signed-off-by: Krzysztof Kozlowski --- include/linux/w1-gpio.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/w1-gpio.h (limited to 'include/linux') diff --git a/include/linux/w1-gpio.h b/include/linux/w1-gpio.h deleted file mode 100644 index 3495fd0dc7900..0000000000000 --- a/include/linux/w1-gpio.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * w1-gpio interface to platform code - * - * Copyright (C) 2007 Ville Syrjala - */ -#ifndef _LINUX_W1_GPIO_H -#define _LINUX_W1_GPIO_H - -struct gpio_desc; - -/** - * struct w1_gpio_platform_data - Platform-dependent data for w1-gpio - */ -struct w1_gpio_platform_data { - struct gpio_desc *gpiod; - struct gpio_desc *pullup_gpiod; - void (*enable_external_pullup)(int enable); - unsigned int pullup_duration; -}; - -#endif /* _LINUX_W1_GPIO_H */ -- cgit v1.2.3 From 1fe13d83e2873b0aedeb5b9a299ca763bd37d75f Mon Sep 17 00:00:00 2001 From: Kaihua Zhong Date: Wed, 29 Nov 2023 09:55:26 +0800 Subject: mfd: Fix a few spelling mistakes in PMIC header file comments Fix four comment typos in MFD PMIC header files. 
Reported-by: k2ci Signed-off-by: Kaihua Zhong Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20231129015526.3302865-1-zhongkaihua@kylinos.cn Signed-off-by: Lee Jones --- include/linux/mfd/max77693-private.h | 2 +- include/linux/mfd/max77843-private.h | 2 +- include/linux/mfd/si476x-platform.h | 2 +- include/linux/mfd/tps65910.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 311f7d3d23236..54444ff2a5dea 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -405,7 +405,7 @@ enum max77693_haptic_reg { MAX77693_HAPTIC_REG_END, }; -/* max77693-pmic LSCNFG configuraton register */ +/* max77693-pmic LSCNFG configuration register */ #define MAX77693_PMIC_LOW_SYS_MASK 0x80 #define MAX77693_PMIC_LOW_SYS_SHIFT 7 diff --git a/include/linux/mfd/max77843-private.h b/include/linux/mfd/max77843-private.h index 0bc7454c4dbe3..2fb4db67f1104 100644 --- a/include/linux/mfd/max77843-private.h +++ b/include/linux/mfd/max77843-private.h @@ -198,7 +198,7 @@ enum max77843_irq_muic { #define MAX77843_MCONFIG_MEN_MASK BIT(MCONFIG_MEN_SHIFT) #define MAX77843_MCONFIG_PDIV_MASK (0x3 << MCONFIG_PDIV_SHIFT) -/* Max77843 charger insterrupts */ +/* Max77843 charger interrupts */ #define MAX77843_CHG_BYP_I BIT(0) #define MAX77843_CHG_BATP_I BIT(2) #define MAX77843_CHG_BAT_I BIT(3) diff --git a/include/linux/mfd/si476x-platform.h b/include/linux/mfd/si476x-platform.h index 18363b773d070..cb99e16ca9473 100644 --- a/include/linux/mfd/si476x-platform.h +++ b/include/linux/mfd/si476x-platform.h @@ -10,7 +10,7 @@ #ifndef __SI476X_PLATFORM_H__ #define __SI476X_PLATFORM_H__ -/* It is possible to select one of the four adresses using pins A0 +/* It is possible to select one of the four addresses using pins A0 * and A1 on SI476x */ #define SI476X_I2C_ADDR_1 0x60 #define SI476X_I2C_ADDR_2 0x61 diff --git 
a/include/linux/mfd/tps65910.h b/include/linux/mfd/tps65910.h index 701925db75b3f..f67ef0a4e041c 100644 --- a/include/linux/mfd/tps65910.h +++ b/include/linux/mfd/tps65910.h @@ -749,7 +749,7 @@ #define VDDCTRL_ST_SHIFT 0 -/*Register VDDCTRL_OP (0x28) bit definitios */ +/*Register VDDCTRL_OP (0x28) bit definitions */ #define VDDCTRL_OP_CMD_MASK 0x80 #define VDDCTRL_OP_CMD_SHIFT 7 #define VDDCTRL_OP_SEL_MASK 0x7F -- cgit v1.2.3 From 4d8ff6b0991d5e86b17b235fc46ec62e9195cb9b Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Sat, 25 Nov 2023 14:51:30 +0530 Subject: spi: Add multi-cs memories support in SPI core AMD-Xilinx GQSPI controller has two advanced mode that allows the controller to consider two flashes as one single device. One of these two mode is the parallel mode in which each byte of data is stored in both devices, the even bits in the lower flash & the odd bits in the upper flash. The byte split is automatically handled by the QSPI controller. The other mode is the stacked mode in which both the flashes share the same SPI bus but each of the device contain half of the data. In this mode, the controller does not follow CS requests but instead internally wires the two CS levels with the value of the most significant address bit. For supporting both these modes SPI core need to be updated for providing multiple CS for a single SPI device. For adding multi CS support the SPI device need to be aware of all the CS values. So, the "chip_select" member in the spi_device structure is now an array that holds all the CS values. spi_device structure now has a "cs_index_mask" member. This acts as an index to the chip_select array. If nth bit of spi->cs_index_mask is set then the driver would assert spi->chip_select[n]. In parallel mode all the chip selects are asserted/de-asserted simultaneously and each byte of data is stored in both devices, the even bits in one, the odd bits in the other. The split is automatically handled by the GQSPI controller. 
The GQSPI controller supports a maximum of two flashes connected in parallel mode. A SPI_CONTROLLER_MULTI_CS flag bit is added in the spi controller flags, through ctlr->flags the spi core will make sure that the controller is capable of handling multiple chip selects at once. For supporting multiple CS via GPIO the cs_gpiod member of the spi_device structure is now an array that holds the gpio descriptor for each chipselect. CS GPIO is not tested on our hardware, but it has been tested by @Stefan https://lore.kernel.org/all/005001da1efc$619ad5a0$24d080e0$@opensource.cirrus.com/ Signed-off-by: Amit Kumar Mahapatra Tested-by: Stefan Binding Link: https://lore.kernel.org/r/20231125092137.2948-4-amit.kumar-mahapatra@amd.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 51 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 255a0562aea5a..50622054b6af9 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -20,6 +20,9 @@ #include +/* Max no. of CS supported per spi device */ +#define SPI_CS_CNT_MAX 4 + struct dma_chan; struct software_node; struct ptp_system_timestamp; @@ -132,7 +135,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. - * @chip_select: Chipselect, distinguishing chips handled by @controller. + * @chip_select: Array of physical chipselect, spi->chipselect[i] gives + * the corresponding physical CS for logical CS i. * @mode: The spi mode defines how data is clocked out and in. * This may be changed by the device's driver. 
* The "active low" default for chipselect mode can be overridden @@ -157,8 +161,8 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. - * @cs_gpiod: GPIO descriptor of the chipselect line (optional, NULL when - * not using a GPIO line) + * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines + * (optional, NULL when not using a GPIO line) * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted @@ -167,6 +171,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. * @pcpu_statistics: statistics for the spi_device + * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * * A @spi_device is used to interchange data between an SPI slave * (usually a discrete chip) and CPU memory. 
@@ -182,7 +187,7 @@ struct spi_device { struct spi_controller *controller; struct spi_controller *master; /* Compatibility layer */ u32 max_speed_hz; - u8 chip_select; + u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ @@ -213,7 +218,7 @@ struct spi_device { void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; - struct gpio_desc *cs_gpiod; /* Chip select GPIO descriptor */ + struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ struct spi_delay word_delay; /* Inter-word delay */ /* CS delays */ struct spi_delay cs_setup; @@ -223,6 +228,13 @@ struct spi_device { /* The statistics */ struct spi_statistics __percpu *pcpu_statistics; + /* Bit mask of the chipselect(s) that the driver need to use from + * the chipselect array.When the controller is capable to handle + * multiple chip selects & memories are connected in parallel + * then more than one bit need to be set in cs_index_mask. + */ + u32 cs_index_mask : SPI_CS_CNT_MAX; + /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: @@ -279,22 +291,33 @@ static inline void *spi_get_drvdata(const struct spi_device *spi) static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx) { - return spi->chip_select; + return spi->chip_select[idx]; } static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect) { - spi->chip_select = chipselect; + spi->chip_select[idx] = chipselect; } static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx) { - return spi->cs_gpiod; + return spi->cs_gpiod[idx]; } static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod) { - spi->cs_gpiod = csgpiod; + spi->cs_gpiod[idx] = csgpiod; +} + +static inline bool spi_is_csgpiod(struct spi_device *spi) +{ + u8 idx; + + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi_get_csgpiod(spi, idx)) 
+ return true; + } + return false; } /** @@ -399,6 +422,8 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use + * @multi_cs_cap: indicates that the SPI Controller can assert/de-assert + * more than one chip select at once. * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. @@ -567,6 +592,11 @@ struct spi_controller { #define SPI_CONTROLLER_MUST_TX BIT(4) /* Requires tx */ #define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select slave */ #define SPI_CONTROLLER_SUSPENDED BIT(6) /* Currently suspended */ + /* + * The spi-controller has multi chip select capability and can + * assert/de-assert more than one chip select at once. + */ +#define SPI_CONTROLLER_MULTI_CS BIT(7) /* Flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; @@ -677,7 +707,8 @@ struct spi_controller { bool rt; bool auto_runtime_pm; bool cur_msg_mapped; - char last_cs; + char last_cs[SPI_CS_CNT_MAX]; + char last_cs_index_mask; bool last_cs_mode_high; bool fallback; struct completion xfer_completion; -- cgit v1.2.3 From a7fb0423c201ba12815877a0b5a68a6a1710b23a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 7 Dec 2023 08:46:14 -0500 Subject: cgroup: Move rcu_head up near the top of cgroup_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe") adds a new rcu_head to the cgroup_root structure and kvfree_rcu() for freeing the cgroup_root. 
The current implementation of kvfree_rcu(), however, has the limitation that the offset of the rcu_head structure within the larger data structure must be less than 4096 or the compilation will fail. See the macro definition of __is_kvfree_rcu_offset() in include/linux/rcupdate.h for more information. By putting rcu_head below the large cgroup structure, any change to the cgroup structure that makes it larger runs the risk of causing a build failure under certain configurations. Commit 77070eeb8821 ("cgroup: Avoid false cacheline sharing of read mostly rstat_cpu") happens to be the last straw that breaks it. Fix this problem by moving the rcu_head structure up before the cgroup structure. Fixes: d23b5c577715 ("cgroup: Make operations on the cgroup root_list RCU safe") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20231207143806.114e0a74@canb.auug.org.au/ Signed-off-by: Waiman Long Acked-by: Yafang Shao Reviewed-by: Yosry Ahmed Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5a97ea95b5649..ea48c861cd369 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -562,6 +562,10 @@ struct cgroup_root { /* Unique id for this hierarchy. */ int hierarchy_id; + /* A list running through the active hierarchies */ + struct list_head root_list; + struct rcu_head rcu; /* Must be near the top */ + /* * The root cgroup. The containing cgroup_root will be destroyed on its * release.
cgrp->ancestors[0] will be used overflowing into the @@ -575,10 +579,6 @@ struct cgroup_root { /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; - /* A list running through the active hierarchies */ - struct list_head root_list; - struct rcu_head rcu; - /* Hierarchy-specific flags */ unsigned int flags; -- cgit v1.2.3 From f76f0d7f20672611974d3cc705996751fc403734 Mon Sep 17 00:00:00 2001 From: Wenkai Lin Date: Sat, 2 Dec 2023 17:17:18 +0800 Subject: crypto: hisilicon/qm - add a function to set qm algs Extract a public function to set qm algs and remove the similar code for setting qm algs in each module. Signed-off-by: Wenkai Lin Signed-off-by: Hao Fang Signed-off-by: Zhiqi Song Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index e3c0a1297b2c0..cdc979f66dba6 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -156,6 +156,11 @@ enum qm_cap_bits { QM_SUPPORT_RPM, }; +struct qm_dev_alg { + u64 alg_msk; + const char *alg; +}; + struct dfx_diff_registers { u32 *regs; u32 reg_offset; @@ -361,7 +366,6 @@ struct hisi_qm { struct work_struct rst_work; struct work_struct cmd_process; - const char *algs; bool use_sva; resource_size_t phys_base; @@ -559,6 +563,8 @@ void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset); u32 hisi_qm_get_hw_info(struct hisi_qm *qm, const struct hisi_qm_cap_info *info_table, u32 index, bool is_read); +int hisi_qm_set_algs(struct hisi_qm *qm, u64 alg_msk, const struct qm_dev_alg *dev_algs, + u32 dev_algs_size); /* Used by VFIO ACC live migration driver */ struct pci_driver *hisi_sec_get_pf_driver(void); -- cgit v1.2.3 From cabe13d0bd2efb8dd50ed2310f57b33e1a69a0d4 Mon Sep 17 00:00:00 2001 From: Zhiqi Song Date: Sat, 2 Dec 2023 17:17:19 +0800 Subject: crypto: hisilicon/qm - save capability 
registers in qm init process In the previous capability register implementation, qm irq related values were read from capability registers dynamically when needed. But in an abnormal scenario, e.g. the core times out, the device needs a soft reset, and the reset fails after the MSE is disabled, the device can not be removed normally, causing the following call trace: | Call trace: | pci_irq_vector+0xfc/0x140 | hisi_qm_uninit+0x278/0x3b0 [hisi_qm] | hpre_remove+0x16c/0x1c0 [hisi_hpre] | pci_device_remove+0x6c/0x264 | device_release_driver_internal+0x1ec/0x3e0 | device_release_driver+0x3c/0x60 | pci_stop_bus_device+0xfc/0x22c | pci_stop_and_remove_bus_device+0x38/0x70 | pci_iov_remove_virtfn+0x108/0x1c0 | sriov_disable+0x7c/0x1e4 | pci_disable_sriov+0x4c/0x6c | hisi_qm_sriov_disable+0x90/0x160 [hisi_qm] | hpre_remove+0x1a8/0x1c0 [hisi_hpre] | pci_device_remove+0x6c/0x264 | device_release_driver_internal+0x1ec/0x3e0 | driver_detach+0x168/0x2d0 | bus_remove_driver+0xc0/0x230 | driver_unregister+0x58/0xdc | pci_unregister_driver+0x40/0x220 | hpre_exit+0x34/0x64 [hisi_hpre] | __arm64_sys_delete_module+0x374/0x620 [...] | Call trace: | free_msi_irqs+0x25c/0x300 | pci_disable_msi+0x19c/0x264 | pci_free_irq_vectors+0x4c/0x70 | hisi_qm_pci_uninit+0x44/0x90 [hisi_qm] | hisi_qm_uninit+0x28c/0x3b0 [hisi_qm] | hpre_remove+0x16c/0x1c0 [hisi_hpre] | pci_device_remove+0x6c/0x264 [...] The reason for this call trace is that when the MSE is disabled, the values of the capability registers in the BAR space become invalid. This will make the subsequent unregister process get the wrong irq vector through capability registers and get the wrong irq number by pci_irq_vector(). So add a capability table structure to pre-store the valid value of the irq information capability register in the qm init process, to avoid obtaining invalid capability register values after the MSE is disabled.
Fixes: 3536cc55cada ("crypto: hisilicon/qm - support get device irq information from hardware registers") Signed-off-by: Zhiqi Song Signed-off-by: Herbert Xu --- include/linux/hisi_acc_qm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index cdc979f66dba6..5f4c74facf6a3 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -266,6 +266,16 @@ struct hisi_qm_cap_info { u32 v3_val; }; +struct hisi_qm_cap_record { + u32 type; + u32 cap_val; +}; + +struct hisi_qm_cap_tables { + struct hisi_qm_cap_record *qm_cap_table; + struct hisi_qm_cap_record *dev_cap_table; +}; + struct hisi_qm_list { struct mutex lock; struct list_head list; @@ -376,6 +386,8 @@ struct hisi_qm { u32 mb_qos; u32 type_rate; struct qm_err_isolate isolate_data; + + struct hisi_qm_cap_tables cap_tables; }; struct hisi_qp_status { -- cgit v1.2.3 From ee25fba76acd8324f9de6628872c8c612a684209 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 4 Dec 2023 10:35:00 +0100 Subject: gpiolib: provide gpiochip_dup_line_label() gpiochip_is_requested() not only has a misleading name but it returns a pointer to a string that is freed when the descriptor is released. Provide a new helper meant to replace it, which returns a copy of the label string instead. 
Signed-off-by: Bartosz Golaszewski Acked-by: Linus Walleij --- include/linux/gpio/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 0aed62f0c6330..5ac6dc30c5478 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -532,6 +532,7 @@ struct gpio_chip { }; const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset); +char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset); /** * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range -- cgit v1.2.3 From 6fd9c9933475a3efd7eed2f80c7778908a560a1f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 4 Dec 2023 10:35:08 +0100 Subject: gpiolib: use gpiochip_dup_line_label() in for_each helpers Rework for_each_requested_gpio_in_range() to use the new helper to retrieve a dynamically allocated copy of the descriptor label and free it at the end of each iteration. We need to leverage the CLASS() destructor to make sure that the label is freed even when breaking out of the loop.
Signed-off-by: Bartosz Golaszewski Acked-by: Linus Walleij --- include/linux/gpio/driver.h | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 5ac6dc30c5478..cae4cdaa87db3 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -534,17 +534,38 @@ struct gpio_chip { const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset); char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset); + +struct _gpiochip_for_each_data { + const char **label; + unsigned int *i; +}; + +DEFINE_CLASS(_gpiochip_for_each_data, + struct _gpiochip_for_each_data, + if (*_T.label) kfree(*_T.label), + ({ + struct _gpiochip_for_each_data _data = { label, i }; + *_data.i = 0; + _data; + }), + const char **label, int *i) + /** * for_each_requested_gpio_in_range - iterates over requested GPIOs in a given range - * @chip: the chip to query - * @i: loop variable - * @base: first GPIO in the range - * @size: amount of GPIOs to check starting from @base - * @label: label of current GPIO + * @_chip: the chip to query + * @_i: loop variable + * @_base: first GPIO in the range + * @_size: amount of GPIOs to check starting from @base + * @_label: label of current GPIO */ -#define for_each_requested_gpio_in_range(chip, i, base, size, label) \ - for (i = 0; i < size; i++) \ - if ((label = gpiochip_is_requested(chip, base + i)) == NULL) {} else +#define for_each_requested_gpio_in_range(_chip, _i, _base, _size, _label) \ + for (CLASS(_gpiochip_for_each_data, _data)(&_label, &_i); \ + *_data.i < _size; \ + (*_data.i)++, kfree(*(_data.label)), *_data.label = NULL) \ + if ((*_data.label = \ + gpiochip_dup_line_label(_chip, _base + *_data.i)) == NULL) {} \ + else if (IS_ERR(*_data.label)) {} \ + else /* Iterates over all requested GPIO of the given @chip */ #define for_each_requested_gpio(chip, i, label) \ -- cgit 
v1.2.3 From f8d05e276b45e3097dfddd628fa991ce69c05c99 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 4 Dec 2023 10:35:09 +0100 Subject: gpiolib: remove gpiochip_is_requested() We have no external users of gpiochip_is_requested(). Let's remove it and replace its internal calls with direct testing of the REQUESTED flag. Signed-off-by: Bartosz Golaszewski Acked-by: Linus Walleij --- include/linux/gpio/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index cae4cdaa87db3..d1a3cb061927f 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -531,7 +531,6 @@ struct gpio_chip { #endif /* CONFIG_OF_GPIO */ }; -const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset); char *gpiochip_dup_line_label(struct gpio_chip *gc, unsigned int offset); -- cgit v1.2.3 From 2a48c635fd9a48699805bbfeee1e4b94b8fe819d Mon Sep 17 00:00:00 2001 From: "justinstitt@google.com" Date: Wed, 6 Dec 2023 23:16:10 +0000 Subject: ethtool: Implement ethtool_puts() Use strscpy() to implement ethtool_puts(). Functionally the same as ethtool_sprintf() when it's used with two arguments or with just "%s" format specifier. Signed-off-by: Justin Stitt Reviewed-by: Przemek Kitszel Reviewed-by: Andrew Lunn Reviewed-by: Madhuri Sripada Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2bb74143edab..deb683d3360f0 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1061,6 +1061,19 @@ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info */ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); +/** + * ethtool_puts - Write string to ethtool string data + * @data: Pointer to a pointer to the start of string to update + * @str: String to write + * + * Write string to *data without a trailing newline. Update *data + * to point at start of next string. + * + * Prefer this function to ethtool_sprintf() when given only + * two arguments or if @fmt is just "%s". + */ +extern void ethtool_puts(u8 **data, const char *str); + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; -- cgit v1.2.3 From 675bf8ef209cc8da28ffefd7d8a93c53735cc84a Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 30 Nov 2023 12:30:01 +0100 Subject: tty: virtio: drop virtio_cons_early_init() The last user of virtio_cons_early_init() was dropped in commit 7fb2b2d51244 ("s390/virtio: remove the old KVM virtio transport"). So now, drop virtio_cons_early_init() and the logic and headers behind too. Signed-off-by: Jiri Slaby (SUSE) Acked-by: Jason Wang Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Amit Shah Cc: Arnd Bergmann Cc: "Michael S. 
Tsirkin" Cc: Xuan Zhuo Cc: linux-alpha@vger.kernel.org Cc: virtualization@lists.linux.dev Link: https://lore.kernel.org/r/20231130113001.29154-1-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/virtio_console.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 include/linux/virtio_console.h (limited to 'include/linux') diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h deleted file mode 100644 index d2e2785af6026..0000000000000 --- a/include/linux/virtio_console.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers: - * - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of IBM nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * Copyright (C) Red Hat, Inc., 2009, 2010, 2011 - * Copyright (C) Amit Shah , 2009, 2010, 2011 - */ -#ifndef _LINUX_VIRTIO_CONSOLE_H -#define _LINUX_VIRTIO_CONSOLE_H - -#include - -int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)); -#endif /* _LINUX_VIRTIO_CONSOLE_H */ -- cgit v1.2.3 From 093258a9963bfac043244995bff87dc2c931b9b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Thu, 30 Nov 2023 15:07:13 +0100 Subject: tty: serial: amba: cleanup whitespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix whitespace in include/linux/amba/serial.h to match current kernel coding standards. 
Fixes about: - CHECK: spaces preferred around that '|' (ctx:VxV) - ERROR: code indent should use tabs where possible - WARNING: Unnecessary space before function pointer arguments - WARNING: please, no spaces at the start of a line Reviewed-by: Linus Walleij Reviewed-by: Ilpo Järvinen Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20231130-mbly-uart-v5-1-6566703a04b5@bootlin.com Signed-off-by: Greg Kroah-Hartman --- include/linux/amba/serial.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index a1307b58cc2c6..27003ec52114c 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -75,10 +75,10 @@ #define UART011_DR_PE (1 << 9) #define UART011_DR_FE (1 << 8) -#define UART01x_RSR_OE 0x08 -#define UART01x_RSR_BE 0x04 -#define UART01x_RSR_PE 0x02 -#define UART01x_RSR_FE 0x01 +#define UART01x_RSR_OE 0x08 +#define UART01x_RSR_BE 0x04 +#define UART01x_RSR_PE 0x02 +#define UART01x_RSR_FE 0x01 #define UART011_FR_RI 0x100 #define UART011_FR_TXFE 0x080 @@ -86,9 +86,9 @@ #define UART01x_FR_TXFF 0x020 #define UART01x_FR_RXFE 0x010 #define UART01x_FR_BUSY 0x008 -#define UART01x_FR_DCD 0x004 -#define UART01x_FR_DSR 0x002 -#define UART01x_FR_CTS 0x001 +#define UART01x_FR_DCD 0x004 +#define UART01x_FR_DSR 0x002 +#define UART01x_FR_CTS 0x001 #define UART01x_FR_TMSK (UART01x_FR_TXFF + UART01x_FR_BUSY) /* @@ -110,14 +110,14 @@ #define UART011_CR_TXE 0x0100 /* transmit enable */ #define UART011_CR_LBE 0x0080 /* loopback enable */ #define UART010_CR_RTIE 0x0040 -#define UART010_CR_TIE 0x0020 -#define UART010_CR_RIE 0x0010 +#define UART010_CR_TIE 0x0020 +#define UART010_CR_RIE 0x0010 #define UART010_CR_MSIE 0x0008 #define ST_UART011_CR_OVSFACT 0x0008 /* Oversampling factor */ #define UART01x_CR_IIRLP 0x0004 /* SIR low power mode */ #define UART01x_CR_SIREN 0x0002 /* SIR enable */ #define UART01x_CR_UARTEN 0x0001 /* UART 
enable */ - + #define UART011_LCRH_SPS 0x80 #define UART01x_LCRH_WLEN_8 0x60 #define UART01x_LCRH_WLEN_7 0x40 @@ -203,8 +203,8 @@ #define UART011_TXDMAE (1 << 1) /* enable transmit dma */ #define UART011_RXDMAE (1 << 0) /* enable receive dma */ -#define UART01x_RSR_ANY (UART01x_RSR_OE|UART01x_RSR_BE|UART01x_RSR_PE|UART01x_RSR_FE) -#define UART01x_FR_MODEM_ANY (UART01x_FR_DCD|UART01x_FR_DSR|UART01x_FR_CTS) +#define UART01x_RSR_ANY (UART01x_RSR_OE | UART01x_RSR_BE | UART01x_RSR_PE | UART01x_RSR_FE) +#define UART01x_FR_MODEM_ANY (UART01x_FR_DCD | UART01x_FR_DSR | UART01x_FR_CTS) #ifndef __ASSEMBLY__ struct amba_device; /* in uncompress this is included but amba/bus.h is not */ @@ -220,8 +220,8 @@ struct amba_pl011_data { bool dma_rx_poll_enable; unsigned int dma_rx_poll_rate; unsigned int dma_rx_poll_timeout; - void (*init) (void); - void (*exit) (void); + void (*init)(void); + void (*exit)(void); }; #endif -- cgit v1.2.3 From 49943393c9f0be61fd494a884851aa117cd72382 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 6 Dec 2023 08:36:48 +0100 Subject: tty: switch tty_port::xmit_* to u8 Both xmit_buf and xmit_fifo of struct tty_port should be u8. To conform to characters in the rest of the tty layer. 
Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20231206073712.17776-4-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 3276311a7f384..1b861f2100b69 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -114,8 +114,8 @@ struct tty_port { unsigned char console:1; struct mutex mutex; struct mutex buf_mutex; - unsigned char *xmit_buf; - DECLARE_KFIFO_PTR(xmit_fifo, unsigned char); + u8 *xmit_buf; + DECLARE_KFIFO_PTR(xmit_fifo, u8); unsigned int close_delay; unsigned int closing_wait; int drain_delay; -- cgit v1.2.3 From 3a00da027946cd08db1c1be2de4620950bbdf074 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 6 Dec 2023 08:36:49 +0100 Subject: tty: make tty_operations::send_xchar accept u8 char tty_operations::send_xchar is one of the last users of 'char' type for characters in the tty layer. Convert it to u8 now. 
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Karsten Keil Cc: Ulf Hansson Cc: Marcel Holtmann Cc: Johan Hedberg Cc: Luiz Augusto von Dentz Cc: netdev@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-bluetooth@vger.kernel.org Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20231206073712.17776-5-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 2 +- include/linux/tty_driver.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index e96c85f4f91ed..d3bedcc08738a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -410,7 +410,7 @@ void tty_wait_until_sent(struct tty_struct *tty, long timeout); void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); -int tty_send_xchar(struct tty_struct *tty, char ch); +int tty_send_xchar(struct tty_struct *tty, u8 ch); int tty_put_char(struct tty_struct *tty, unsigned char c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index f428c1b784a20..7372124fbf90b 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -242,7 +242,7 @@ struct serial_struct; * Optional: If not provided, the device is assumed to have no FIFO. * Usually correct to invoke via tty_wait_until_sent(). May sleep. * - * @send_xchar: ``void ()(struct tty_struct *tty, char ch)`` + * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)`` * * This routine is used to send a high-priority XON/XOFF character (@ch) * to the @tty device. 
@@ -374,7 +374,7 @@ struct tty_operations { void (*flush_buffer)(struct tty_struct *tty); void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); - void (*send_xchar)(struct tty_struct *tty, char ch); + void (*send_xchar)(struct tty_struct *tty, u8 ch); int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); -- cgit v1.2.3 From 4e8d8878145f1478886e1630c44113ad2c2eb99d Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 6 Dec 2023 08:36:50 +0100 Subject: tty: core: the rest to u8 There are still last minor users in the tty core that still reference characters by the 'char' type. Switch them to u8. Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20231206073712.17776-6-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index d3bedcc08738a..cc08f7e1c122b 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -242,7 +242,7 @@ struct tty_struct { void *driver_data; spinlock_t files_lock; int write_cnt; - unsigned char *write_buf; + u8 *write_buf; struct list_head tty_files; @@ -411,7 +411,7 @@ void stop_tty(struct tty_struct *tty); void start_tty(struct tty_struct *tty); void tty_write_message(struct tty_struct *tty, char *msg); int tty_send_xchar(struct tty_struct *tty, u8 ch); -int tty_put_char(struct tty_struct *tty, unsigned char c); +int tty_put_char(struct tty_struct *tty, u8 c); unsigned int tty_chars_in_buffer(struct tty_struct *tty); unsigned int tty_write_room(struct tty_struct *tty); void tty_driver_flush_buffer(struct tty_struct *tty); -- cgit v1.2.3 From f2470d2bc4327c2c1a604c6e247442dbb14c90c5 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 6 Dec 2023 08:37:07 +0100 Subject: tty: serdev: convert to u8 and size_t in 
serdev_controller_ops Switch character types to u8 and sizes to size_t, to conform to characters/sizes in the rest of the tty layer. This patch converts only the struct serdev_controller_ops hooks; the rest will follow. Signed-off-by: "Jiri Slaby (SUSE)" Cc: Rob Herring Link: https://lore.kernel.org/r/20231206073712.17776-23-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 0ebf53bb254fa..8cdab2c3b6d58 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -82,7 +82,7 @@ enum serdev_parity { * serdev controller structures */ struct serdev_controller_ops { - int (*write_buf)(struct serdev_controller *, const unsigned char *, size_t); + ssize_t (*write_buf)(struct serdev_controller *, const u8 *, size_t); void (*write_flush)(struct serdev_controller *); int (*write_room)(struct serdev_controller *); int (*open)(struct serdev_controller *); @@ -185,9 +185,9 @@ static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl serdev->ops->write_wakeup(serdev); } -static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl, - const unsigned char *data, - size_t count) +static inline ssize_t serdev_controller_receive_buf(struct serdev_controller *ctrl, + const u8 *data, + size_t count) { struct serdev_device *serdev = ctrl->serdev; -- cgit v1.2.3 From 475fc6e2de6fec0ff3c9a74ddcfd2b52c90adc0d Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Wed, 6 Dec 2023 08:37:08 +0100 Subject: tty: serdev: convert to u8 and size_t Switch character types to u8 and sizes to size_t, to conform to characters/sizes in the rest of the tty layer. This patch converts struct serdev_device_ops hooks and their instantiations.
Signed-off-by: "Jiri Slaby (SUSE)" Cc: Rob Herring Acked-by: Johan Hovold Link: https://lore.kernel.org/r/20231206073712.17776-24-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 8cdab2c3b6d58..3fab88ba265ee 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -27,7 +27,7 @@ struct serdev_device; * not sleep. */ struct serdev_device_ops { - int (*receive_buf)(struct serdev_device *, const unsigned char *, size_t); + ssize_t (*receive_buf)(struct serdev_device *, const u8 *, size_t); void (*write_wakeup)(struct serdev_device *); }; @@ -204,13 +204,13 @@ void serdev_device_close(struct serdev_device *); int devm_serdev_device_open(struct device *, struct serdev_device *); unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int); void serdev_device_set_flow_control(struct serdev_device *, bool); -int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t); +int serdev_device_write_buf(struct serdev_device *, const u8 *, size_t); void serdev_device_wait_until_sent(struct serdev_device *, long); int serdev_device_get_tiocm(struct serdev_device *); int serdev_device_set_tiocm(struct serdev_device *, int, int); int serdev_device_break_ctl(struct serdev_device *serdev, int break_state); void serdev_device_write_wakeup(struct serdev_device *); -int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, long); +ssize_t serdev_device_write(struct serdev_device *, const u8 *, size_t, long); void serdev_device_write_flush(struct serdev_device *); int serdev_device_write_room(struct serdev_device *); @@ -248,7 +248,7 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev } static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {} static inline 
int serdev_device_write_buf(struct serdev_device *serdev, - const unsigned char *buf, + const u8 *buf, size_t count) { return -ENODEV; @@ -266,8 +266,9 @@ static inline int serdev_device_break_ctl(struct serdev_device *serdev, int brea { return -EOPNOTSUPP; } -static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf, - size_t count, unsigned long timeout) +static inline ssize_t serdev_device_write(struct serdev_device *sdev, + const u8 *buf, size_t count, + unsigned long timeout) { return -ENODEV; } -- cgit v1.2.3 From 8132d887a7023b212f242a51ae89281c69fde996 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Oct 2023 12:11:56 -0400 Subject: KVM: remove CONFIG_HAVE_KVM_EVENTFD virt/kvm/eventfd.c is compiled unconditionally, meaning that the ioeventfds member of struct kvm is accessed unconditionally. CONFIG_HAVE_KVM_EVENTFD therefore must be defined for KVM common code to compile successfully, remove it. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ea1523a7b83ad..3fe5a6be77688 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -782,7 +782,6 @@ struct kvm { struct list_head vm_list; struct mutex lock; struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; -#ifdef CONFIG_HAVE_KVM_EVENTFD struct { spinlock_t lock; struct list_head items; @@ -791,7 +790,6 @@ struct kvm { struct mutex resampler_lock; } irqfds; struct list_head ioeventfds; -#endif struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; @@ -2056,8 +2054,6 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); -#ifdef CONFIG_HAVE_KVM_EVENTFD - void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); @@ -2082,31 +2078,7 @@ 
static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, { return false; } -#endif - -#else - -static inline void kvm_eventfd_init(struct kvm *kvm) {} - -static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) -{ - return -EINVAL; -} - -static inline void kvm_irqfd_release(struct kvm *kvm) {} - -#ifdef CONFIG_HAVE_KVM_IRQCHIP -static inline void kvm_irq_routing_update(struct kvm *kvm) -{ -} -#endif - -static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) -{ - return -ENOSYS; -} - -#endif /* CONFIG_HAVE_KVM_EVENTFD */ +#endif /* CONFIG_HAVE_KVM_IRQFD */ void kvm_arch_irq_routing_update(struct kvm *kvm); -- cgit v1.2.3 From c5b31cc2371728ddefe9baf1d036aeb630a25d96 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Oct 2023 12:07:32 -0400 Subject: KVM: remove CONFIG_HAVE_KVM_IRQFD All platforms with a kernel irqchip have support for irqfd. Unify the two configuration items so that userspace can expect to use irqfd to inject interrupts into the irqchip. Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3fe5a6be77688..1bba24a13ec93 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -805,8 +805,7 @@ struct kvm { * Update side is protected by irq_lock. 
*/ struct kvm_irq_routing_table __rcu *irq_routing; -#endif -#ifdef CONFIG_HAVE_KVM_IRQFD + struct hlist_head irq_ack_notifier_list; #endif @@ -996,7 +995,7 @@ static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) } #endif -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd_init(void); void kvm_irqfd_exit(void); #else @@ -2057,7 +2056,7 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi); void kvm_eventfd_init(struct kvm *kvm); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); -#ifdef CONFIG_HAVE_KVM_IRQFD +#ifdef CONFIG_HAVE_KVM_IRQCHIP int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); bool kvm_notify_irqfd_resampler(struct kvm *kvm, @@ -2078,7 +2077,7 @@ static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm, { return false; } -#endif /* CONFIG_HAVE_KVM_IRQFD */ +#endif /* CONFIG_HAVE_KVM_IRQCHIP */ void kvm_arch_irq_routing_update(struct kvm *kvm); -- cgit v1.2.3 From 8ed26ab8d59111c2f7b86d200d1eb97d2a458fd1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Oct 2023 12:18:00 -0400 Subject: KVM: clean up directives to compile out irqfds Keep all #ifdef CONFIG_HAVE_KVM_IRQCHIP parts of eventfd.c together, and compile out the irqfds field of struct kvm if the symbol is not defined. 
Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bba24a13ec93..7e7fd25b09b3e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -782,6 +782,7 @@ struct kvm { struct list_head vm_list; struct mutex lock; struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; +#ifdef CONFIG_HAVE_KVM_IRQCHIP struct { spinlock_t lock; struct list_head items; @@ -789,6 +790,7 @@ struct kvm { struct list_head resampler_list; struct mutex resampler_lock; } irqfds; +#endif struct list_head ioeventfds; struct kvm_vm_stat stat; struct kvm_arch arch; -- cgit v1.2.3 From 92e1567ee3e3f6f160e320890ac77eec50bf8e7d Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 7 Dec 2023 22:25:17 -0500 Subject: bpf: Add some comments to stack representation Add comments to the datastructure tracking the stack state, as the mapping between each stack slot and where its state is stored is not entirely obvious. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231208032519.260451-2-andreimatei1@gmail.com --- include/linux/bpf_verifier.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bada59812e003..314b679fb4940 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -321,7 +321,17 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; + /* The state of the stack. Each element of the array describes BPF_REG_SIZE + * (i.e. 8) bytes worth of stack memory. + * stack[0] represents bytes [*(r10-8)..*(r10-1)] + * stack[1] represents bytes [*(r10-16)..*(r10-9)] + * ... 
+ * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] + */ struct bpf_stack_state *stack; + /* Size of the current stack, in bytes. The stack state is tracked below, in + * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. + */ int allocated_stack; }; @@ -658,6 +668,10 @@ struct bpf_verifier_env { int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; + /* Allow access to uninitialized stack memory. Writes with fixed offset are + * always allowed, so this refers to reads (with fixed or variable offset), + * to writes with variable offset and to indirect (helper) accesses. + */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; -- cgit v1.2.3 From 5b20755b7780464fea3e54af0af744258dcc2841 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 26 Nov 2023 16:19:14 +0900 Subject: init: move THIS_MODULE from <linux/export.h> to <linux/init.h> Commit f50169324df4 ("module.h: split out the EXPORT_SYMBOL into export.h") appropriately separated EXPORT_SYMBOL into <linux/export.h> because modules and EXPORT_SYMBOL are orthogonal; modules are symbol consumers, while EXPORT_SYMBOL are used by symbol providers, which may not be necessarily a module. However, that commit also relocated THIS_MODULE. As explained in the commit description, the intention was to define THIS_MODULE in a lightweight header, but I do not believe <linux/export.h> was the best location because EXPORT_SYMBOL and THIS_MODULE are unrelated. Move it to another lightweight header, <linux/init.h>. The reason for choosing <linux/init.h> is to make <linux/moduleparam.h> self-contained without relying on <linux/linkage.h> incorrectly including <linux/export.h>. With this adjustment, the role of <linux/export.h> becomes clearer as it only defines EXPORT_SYMBOL.
Signed-off-by: Masahiro Yamada Reviewed-by: Luis Chamberlain --- include/linux/export.h | 18 ------------------ include/linux/init.h | 7 +++++++ 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/export.h b/include/linux/export.h index 9911508a9604f..0bbd02fd351db 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -6,15 +6,6 @@ #include #include -/* - * Export symbols from the kernel to modules. Forked from module.h - * to reduce the amount of pointless cruft we feed to gcc when only - * exporting a simple symbol or two. - * - * Try not to add #includes here. It slows compilation and makes kernel - * hackers place grumpy comments in header files. - */ - /* * This comment block is used by fixdep. Please do not remove. * @@ -23,15 +14,6 @@ * side effect of the *.o build rule. */ -#ifndef __ASSEMBLY__ -#ifdef MODULE -extern struct module __this_module; -#define THIS_MODULE (&__this_module) -#else -#define THIS_MODULE ((struct module *)0) -#endif -#endif /* __ASSEMBLY__ */ - #ifdef CONFIG_64BIT #define __EXPORT_SYMBOL_REF(sym) \ .balign 8 ASM_NL \ diff --git a/include/linux/init.h b/include/linux/init.h index 01b52c9c75268..3fa3f6241350b 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -179,6 +179,13 @@ extern void (*late_time_init)(void); extern bool initcall_debug; +#ifdef MODULE +extern struct module __this_module; +#define THIS_MODULE (&__this_module) +#else +#define THIS_MODULE ((struct module *)0) +#endif + #endif #ifndef MODULE -- cgit v1.2.3 From b73f08bb7fe5a0901646ca5ceaa1e7a2d5ee6293 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Thu, 7 Dec 2023 13:39:27 +0100 Subject: iio: adc: ad9467: fix scale setting When reading in_voltage_scale we can get something like: root@analog:/sys/bus/iio/devices/iio:device2# cat in_voltage_scale 0.038146 However, when reading the available options: root@analog:/sys/bus/iio/devices/iio:device2# cat in_voltage_scale_available 2000.000000 
2100.000006 2200.000007 2300.000008 2400.000009 2500.000010 which does not make sense. Moreover, when trying to set a new scale we get an error because there's no call to __ad9467_get_scale() to give us values as given when reading in_voltage_scale. Fix it by computing the available scales during probe and properly pass the list when .read_available() is called. While at it, change to use .read_available() from iio_info. Also note that to properly fix this, adi-axi-adc.c has to be changed accordingly. Fixes: ad6797120238 ("iio: adc: ad9467: add support AD9467 ADC") Signed-off-by: Nuno Sa Reviewed-by: David Lechner Link: https://lore.kernel.org/r/20231207-iio-backend-prep-v2-4-a4a33bc4d70e@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/adi-axi-adc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/adc/adi-axi-adc.h b/include/linux/iio/adc/adi-axi-adc.h index 52620e5b80522..b7904992d5619 100644 --- a/include/linux/iio/adc/adi-axi-adc.h +++ b/include/linux/iio/adc/adi-axi-adc.h @@ -41,6 +41,7 @@ struct adi_axi_adc_chip_info { * @reg_access IIO debugfs_reg_access hook for the client ADC * @read_raw IIO read_raw hook for the client ADC * @write_raw IIO write_raw hook for the client ADC + * @read_avail IIO read_avail hook for the client ADC */ struct adi_axi_adc_conv { const struct adi_axi_adc_chip_info *chip_info; @@ -54,6 +55,9 @@ struct adi_axi_adc_conv { int (*write_raw)(struct adi_axi_adc_conv *conv, struct iio_chan_spec const *chan, int val, int val2, long mask); + int (*read_avail)(struct adi_axi_adc_conv *conv, + struct iio_chan_spec const *chan, + const int **val, int *type, int *length, long mask); }; struct adi_axi_adc_conv *devm_adi_axi_adc_conv_register(struct device *dev, -- cgit v1.2.3 From 23e9f0138963ceef2a252d887534923a0502b2da Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 3 Nov 2023 11:14:50 +0800 Subject: mm/vmstat: move pgdemote_* to per-node stats Demotion will migrate 
pages across nodes. Previously, only the global demotion statistics were accounted for. Change them to per-node statistics, making it easier to observe where demotion occurs on each node. This will help to identify which nodes are under pressure. This patch also puts pgdemote_* behind CONFIG_NUMA_BALANCING, since demotion is not available for !CONFIG_NUMA_BALANCING. With this patch, here is a sample where node0 and node1 are DRAM and node3 is PMEM: Global stats: $ grep demote /proc/vmstat pgdemote_kswapd 254288 pgdemote_direct 113497 pgdemote_khugepaged 0 Per-node stats: $ grep demote /sys/devices/system/node/node0/vmstat # demotion source pgdemote_kswapd 68454 pgdemote_direct 83431 pgdemote_khugepaged 0 $ grep demote /sys/devices/system/node/node1/vmstat # demotion source pgdemote_kswapd 185834 pgdemote_direct 30066 pgdemote_khugepaged 0 $ grep demote /sys/devices/system/node/node3/vmstat # demotion target pgdemote_kswapd 0 pgdemote_direct 0 pgdemote_khugepaged 0 Link: https://lkml.kernel.org/r/20231103031450.1456523-1-lizhijian@fujitsu.com Signed-off-by: Li Zhijian Acked-by: "Huang, Ying" Cc: Greg Kroah-Hartman Cc: "Rafael J.
Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++++ include/linux/vm_event_item.h | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3c25226beeed4..14faffa4354f5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -206,6 +206,10 @@ enum node_stat_item { #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /* PGDEMOTE_*: pages demoted */ + PGDEMOTE_KSWAPD, + PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, #endif NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa12400400..d1b847502f09c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,9 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, PGSTEAL_KHUGEPAGED, - PGDEMOTE_KSWAPD, - PGDEMOTE_DIRECT, - PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, -- cgit v1.2.3 From b2472efe4316b2687c153919c1513a098bd82c17 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:37 +0800 Subject: maple_tree: introduce {mtree,mas}_lock_nested() In some cases, nested locks may be needed, so {mtree,mas}_lock_nested is introduced. For example, when duplicating maple tree, we need to hold the locks of two trees, in which case nested locks are needed. At the same time, add the definition of spin_lock_nested() in tools for testing. Link: https://lkml.kernel.org/r/20231027033845.90608-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. 
Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index d01e850b570fd..f91dbc7fe0911 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -256,6 +256,8 @@ struct maple_tree { struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_lock_nested(mas, subclass) \ + spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* @@ -406,6 +408,8 @@ struct ma_wr_state { }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_lock_nested(mas, subclass) \ + spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) -- cgit v1.2.3 From fd32e4e9b7646510ee9010e0d5f8b8857d48a6f7 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:38 +0800 Subject: maple_tree: introduce interfaces __mt_dup() and mtree_dup() Introduce interfaces __mt_dup() and mtree_dup(), which are used to duplicate a maple tree. They duplicate a maple tree in Depth-First Search (DFS) pre-order traversal. It uses memcopy() to copy nodes in the source tree and allocate new child nodes in non-leaf nodes. The new node is exactly the same as the source node except for all the addresses stored in it. It will be faster than traversing all elements in the source tree and inserting them one by one into the new tree. The time complexity of these two functions is O(n). The difference between __mt_dup() and mtree_dup() is that mtree_dup() handles locks internally. Analysis of the average time complexity of this algorithm: For simplicity, let's assume that the maximum branching factor of all non-leaf nodes is 16 (in allocation mode, it is 10), and the tree is a full tree. 
Under the given conditions, if there is a maple tree with n elements, the number of its leaves is n/16. From bottom to top, the number of nodes in each level is 1/16 of the number of nodes in the level below. So the total number of nodes in the entire tree is given by the sum of n/16 + n/16^2 + n/16^3 + ... + 1. This is a geometric series, and it has log(n) terms with base 16. According to the formula for the sum of a geometric series, the sum of this series can be calculated as (n-1)/15. Each node has only one parent node pointer, which can be considered as an edge. In total, there are (n-1)/15-1 edges. This algorithm consists of two operations: 1. Traversing all nodes in DFS order. 2. For each node, making a copy and performing necessary modifications to create a new node. For the first part, DFS traversal will visit each edge twice. Let T(descend) represent the cost of taking one step downwards, and T(ascend) represent the cost of taking one step upwards. Both of them are treated as constants (mas_ascend() may not strictly be one, as it contains a loop, but here we ignore that and treat it as a constant). So the time spent on the first part can be represented as ((n-1)/15-1) * (T(ascend) + T(descend)). For the second part, each node will be copied, and the cost of copying a node is denoted as T(copy_node). For each non-leaf node, it is necessary to reallocate all child nodes, and the cost of this operation is denoted as T(dup_alloc). The behavior behind memory allocation is complex and not specific to the maple tree operation. Here, we assume that the time required for a single allocation is constant. Since the size of a node is fixed, both of these symbols are also constants. We can calculate that the time spent on the second part is ((n-1)/15) * T(copy_node) + ((n-1)/15 - n/16) * T(dup_alloc).
Adding both parts together, the total time spent by the algorithm can be represented as: ((n-1)/15) * (T(ascend) + T(descend) + T(copy_node) + T(dup_alloc)) - n/16 * T(dup_alloc) - (T(ascend) + T(descend)) Let C1 = T(ascend) + T(descend) + T(copy_node) + T(dup_alloc) Let C2 = T(dup_alloc) Let C3 = T(ascend) + T(descend) Finally, the expression can be simplified as: ((16 * C1 - 15 * C2) / (15 * 16)) * n - (C1 / 15 + C3). This is a linear function, so the average time complexity is O(n). Link: https://lkml.kernel.org/r/20231027033845.90608-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index f91dbc7fe0911..a452dd8a1e5c2 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -329,6 +329,9 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); +int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); +int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); + void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); -- cgit v1.2.3 From d2406291483775ecddaee929231a39c70c08fda2 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Fri, 27 Oct 2023 11:38:45 +0800 Subject: fork: use __mt_dup() to duplicate maple tree in dup_mmap() In dup_mmap(), using __mt_dup() to duplicate the old maple tree and then directly replacing the entries of VMAs in the new maple tree can result in better performance. __mt_dup() uses DFS pre-order to duplicate the maple tree, so it is efficient. 
The average time complexity of __mt_dup() is O(n), where n is the number of VMAs. The proof of the time complexity is provided in the commit log that introduces __mt_dup(). After duplicating the maple tree, each element is traversed and replaced (ignoring the cases of deletion, which are rare). Since it is only a replacement operation for each element, this process is also O(n). Analyzing the exact time complexity of the previous algorithm is challenging because each insertion can involve appending to a node, pushing data to adjacent nodes, or even splitting nodes. The frequency of each action is difficult to calculate. The worst-case scenario for a single insertion is when the tree undergoes splitting at every level. If we consider each insertion as the worst-case scenario, we can determine that the upper bound of the time complexity is O(n*log(n)), although this is a loose upper bound. However, based on the test data, it appears that the actual time complexity is likely to be O(n). As the entire maple tree is duplicated using __mt_dup(), if dup_mmap() fails, there will be a portion of VMAs that have not been duplicated in the maple tree. To handle this, we mark the failure point with XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, stop releasing VMAs that have not been duplicated after this point. There is a "spawn" in byte-unixbench[1], which can be used to test the performance of fork(). I modified it slightly to make it work with different numbers of VMAs. Below are the test results. The first row shows the number of VMAs. The second and third rows show the number of fork() calls per ten seconds, corresponding to next-20231006 and this patchset, respectively. The fourth row shows the relative improvement of this patchset over next-20231006. The test results were obtained with CPU binding to avoid scheduler load balancing that could cause unstable results. There are still some fluctuations in the test results, but at least they are better than the original performance.
21 121 221 421 821 1621 3221 6421 12821 25621 51221 112100 76261 54227 34035 20195 11112 6017 3161 1606 802 393 114558 83067 65008 45824 28751 16072 8922 4747 2436 1233 599 2.19% 8.92% 19.88% 34.64% 42.37% 44.64% 48.28% 50.17% 51.68% 53.74% 52.42% [1] https://github.com/kdlucas/byte-unixbench/tree/master Link: https://lkml.kernel.org/r/20231027033845.90608-11-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: Jonathan Corbet Cc: Mateusz Guzik Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece7..64cd1ee4aaccd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -994,6 +994,17 @@ static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, return mas_expected_entries(&vmi->mas, count); } +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + __mas_set_range(&vmi->mas, start, end - 1); + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { -- cgit v1.2.3 From ff6c3d81f2e86b63a3a530683f89ef393882782a Mon Sep 17 00:00:00 2001 From: Liam Ni Date: Thu, 26 Oct 2023 10:03:29 +0800 Subject: NUMA: optimize detection of memory with no node id assigned by firmware Sanity check that makes sure the nodes cover all memory loops over numa_meminfo to count the pages that have node id assigned by the firmware, then loops again over memblock.memory to find the total amount of memory and in the end checks that the difference between the total memory and memory that covered by 
nodes is less than some threshold. Worse, the loop over numa_meminfo calls __absent_pages_in_range() that also partially traverses memblock.memory. It's much simpler and more efficient to have a single traversal of memblock.memory that verifies that amount of memory not covered by nodes is less than a threshold. Introduce memblock_validate_numa_coverage() that does exactly that and use it instead of numa_meminfo_cover_memory(). Link: https://lkml.kernel.org/r/20231026020329.327329-1-zhiguangni01@gmail.com Signed-off-by: Liam Ni Reviewed-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Bibo Mao Cc: Binbin Zhou Cc: Borislav Petkov Cc: Dave Hansen Cc: Feiyang Chen Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: WANG Xuerui Signed-off-by: Andrew Morton --- include/linux/memblock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ae3bde302f704..b695f9e946dab 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -123,6 +123,7 @@ int memblock_physmem_add(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); +bool memblock_validate_numa_coverage(unsigned long threshold_bytes); int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); -- cgit v1.2.3 From a4fc4a0c45f2617c3aa8b693739de264e0c09909 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:40 +0000 Subject: mm: add folio_zero_tail() and use it in ext4 Patch series "Add folio_zero_tail() and folio_fill_tail()". I'm trying to make it easier for filesystems with tailpacking / stuffing / inline data to use folios. The primary function here is folio_fill_tail(). 
You give it a pointer to memory where the data currently is, and it takes care of copying it into the folio at that offset. That works for gfs2 & iomap. Then There's Ext4. Rather than gin up some kind of specialist "Here's two pointers to two blocks of memory" routine, just let it do its current thing, and let it call folio_zero_tail(), which is also called by folio_fill_tail(). Other filesystems can be converted later; these ones seemed like good examples as they're already partly or completely converted to folios. This patch (of 3): Instead of unmapping the folio after copying the data to it, then mapping it again to zero the tail, provide folio_zero_tail() to zero the tail of an already-mapped folio. [akpm@linux-foundation.org: fix kerneldoc argument ordering] Link: https://lkml.kernel.org/r/20231107212643.3490372-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231107212643.3490372-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Darrick J. Wong Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index be20cff4ba737..5ebd5e4dfbf85 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -483,6 +483,44 @@ static inline void memcpy_to_folio(struct folio *folio, size_t offset, flush_dcache_folio(folio); } +/** + * folio_zero_tail - Zero the tail of a folio. + * @folio: The folio to zero. + * @offset: The byte offset in the folio to start zeroing at. + * @kaddr: The address the folio is currently mapped to. + * + * If you have already used kmap_local_folio() to map a folio, written + * some data to it and now need to zero the end of the folio (and flush + * the dcache), you can use this function.
If you do not have the + * folio kmapped (eg the folio has been partially populated by DMA), + * use folio_zero_range() or folio_zero_segment() instead. + * + * Return: An address which can be passed to kunmap_local(). + */ +static inline __must_check void *folio_zero_tail(struct folio *folio, + size_t offset, void *kaddr) +{ + size_t len = folio_size(folio) - offset; + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memset(kaddr, 0, max); + kunmap_local(kaddr); + len -= max; + offset += max; + max = PAGE_SIZE; + kaddr = kmap_local_folio(folio, offset); + } + } + + memset(kaddr, 0, len); + flush_dcache_folio(folio); + + return kaddr; +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. -- cgit v1.2.3 From 6eaa266b54660f6b3654ad8902b4f7027054f55a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Nov 2023 21:26:41 +0000 Subject: mm: add folio_fill_tail() and use it in iomap The iomap code was limited to PAGE_SIZE bytes; generalise it to cover an arbitrary-sized folio, and move it to be a common helper. [akpm@linux-foundation.org: fix folio_fill_tail(), per Andreas Gruenbacher] Link: https://lkml.kernel.org/r/20231107212643.3490372-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Andreas Dilger Cc: Darrick J. Wong Cc: Theodore Ts'o Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton --- include/linux/highmem.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 5ebd5e4dfbf85..451c1dff0e873 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -521,6 +521,44 @@ static inline __must_check void *folio_zero_tail(struct folio *folio, return kaddr; } +/** + * folio_fill_tail - Copy some data to a folio and pad with zeroes. + * @folio: The destination folio. 
+ * @offset: The offset into @folio at which to start copying. + * @from: The data to copy. + * @len: How many bytes of data to copy. + * + * This function is most useful for filesystems which support inline data. + * When they want to copy data from the inode into the page cache, this + * function does everything for them. It supports large folios even on + * HIGHMEM configurations. + */ +static inline void folio_fill_tail(struct folio *folio, size_t offset, + const char *from, size_t len) +{ + char *to = kmap_local_folio(folio, offset); + + VM_BUG_ON(offset + len > folio_size(folio)); + + if (folio_test_highmem(folio)) { + size_t max = PAGE_SIZE - offset_in_page(offset); + + while (len > max) { + memcpy(to, from, max); + kunmap_local(to); + len -= max; + from += max; + offset += max; + max = PAGE_SIZE; + to = kmap_local_folio(folio, offset); + } + } + + memcpy(to, from, len); + to = folio_zero_tail(folio, offset + len, to + len); + kunmap_local(to); +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. -- cgit v1.2.3 From c36f9d3d2c3e17f9eef1d2f47a63c91d51d55e87 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:02 +0000 Subject: mm: remove test_set_page_writeback() Patch series "Make folio_start_writeback return void". Most of the folio flag-setting functions return void. folio_start_writeback is gratuitously different; the only two filesystems that do anything with the return value emit debug messages if it's already set, and we can (and should) do that internally without bothering the filesystem to do it. This patch (of 4): There are no more callers of this wrapper. 
Link: https://lkml.kernel.org/r/20231108204605.745109-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231108204605.745109-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88e64acebfea..a440062e93865 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -780,11 +780,6 @@ bool set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static inline bool test_set_page_writeback(struct page *page) -{ - return set_page_writeback(page); -} - static __always_inline bool folio_test_head(struct folio *folio) { return test_bit(PG_head, folio_flags(folio, FOLIO_PF_ANY)); -- cgit v1.2.3 From b5612c368648a7be52411b288d09593e5945d1aa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 8 Nov 2023 20:46:05 +0000 Subject: mm: return void from folio_start_writeback() and related functions Nobody now checks the return value from any of these functions, so add an assertion at the beginning of the function and return void. 
Link: https://lkml.kernel.org/r/20231108204605.745109-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Josef Bacik Cc: David Howells Cc: Steve French Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a440062e93865..735cddc13d20e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -772,8 +772,8 @@ static __always_inline void SetPageUptodate(struct page *page) CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) -bool __folio_start_writeback(struct folio *folio, bool keep_write); -bool set_page_writeback(struct page *page); +void __folio_start_writeback(struct folio *folio, bool keep_write); +void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) -- cgit v1.2.3 From 16f5dfbc851b55b87101a20e181d4a14be3007d6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Nov 2023 21:15:07 +0000 Subject: gfp: include __GFP_NOWARN in GFP_NOWAIT GFP_NOWAIT callers are always prepared for their allocations to fail because they fail so frequently. Forcing the callers to remember to add __GFP_NOWARN is just annoying and leads to an endless stream of patches for the places where we forgot to add it. We can now remove __GFP_NOWARN from all the callers which specify GFP_NOWAIT, but I'd rather wait a cycle and send patches to each maintainer instead of creating a big pile of merge conflicts. 
Link: https://lkml.kernel.org/r/20231109211507.2262419-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a58670c57..ae994534a12aa 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -274,7 +274,8 @@ typedef unsigned int __bitwise gfp_t; * accounted to kmemcg. * * %GFP_NOWAIT is for kernel allocations that should not stall for direct - * reclaim, start physical IO or use any filesystem callback. + * reclaim, start physical IO or use any filesystem callback. It is very + * likely to fail to allocate memory, even for very small allocations. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. @@ -325,7 +326,7 @@ typedef unsigned int __bitwise gfp_t; #define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) -#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM | __GFP_NOWARN) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) -- cgit v1.2.3 From af7628d6ec196999175ecb3fdb38336489b0f88a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Nov 2023 16:14:47 +0000 Subject: fs: convert error_remove_page to error_remove_folio There were already assertions that we were not passing a tail page to error_remove_page(), so make the compiler enforce that by converting everything to pass and use a folio. 
Link: https://lkml.kernel.org/r/20231117161447.2461643-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 +- include/linux/mm.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..31b2cf963db9c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -434,7 +434,7 @@ struct address_space_operations { bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); - int (*error_remove_page)(struct address_space *, struct page *); + int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, diff --git a/include/linux/mm.h b/include/linux/mm.h index 64cd1ee4aaccd..13a0902717161 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2384,7 +2384,8 @@ extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); -int generic_error_remove_page(struct address_space *mapping, struct page *page); +int generic_error_remove_folio(struct address_space *mapping, + struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); -- cgit v1.2.3 From 022012dcf44209074af97b6ae531a10c08736b31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:13 +0100 Subject: lib/stackdepot, kasan: add flags to __stack_depot_save and rename Change the bool can_alloc argument of __stack_depot_save to a u32 argument that accepts a set of flags. 
The following patch will add another flag to stack_depot_save_flags besides the existing STACK_DEPOT_FLAG_CAN_ALLOC. Also rename the function to stack_depot_save_flags, as __stack_depot_save is a cryptic name. Link: https://lkml.kernel.org/r/645fa15239621eebbd3a10331e5864b718839512.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index e58306783d8e7..0b262e14144e7 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -32,6 +32,17 @@ typedef u32 depot_stack_handle_t; */ #define STACK_DEPOT_EXTRA_BITS 5 +typedef u32 depot_flags_t; + +/* + * Flags that can be passed to stack_depot_save_flags(); see the comment next + * to its declaration for more details. + */ +#define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) + +#define STACK_DEPOT_FLAGS_NUM 1 +#define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) + /* * Using stack depot requires its initialization, which can be done in 3 ways: * @@ -69,31 +80,34 @@ static inline int stack_depot_early_init(void) { return 0; } #endif /** - * __stack_depot_save - Save a stack trace to stack depot + * stack_depot_save_flags - Save a stack trace to stack depot * * @entries: Pointer to the stack trace * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags - * @can_alloc: Allocate stack pools (increased chance of failure if false) + * @depot_flags: Stack depot flags + * + * Saves a stack trace from @entries array of size @nr_entries. * - * Saves a stack trace from @entries array of size @nr_entries.
If @can_alloc is - * %true, stack depot can replenish the stack pools in case no space is left - * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids - * any allocations and fails if no space is left to store the stack trace. + * If STACK_DEPOT_FLAG_CAN_ALLOC is set in @depot_flags, stack depot can + * replenish the stack pools in case no space is left (allocates using GFP + * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and + * fails if no space is left to store the stack trace. * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * - * Context: Any context, but setting @can_alloc to %false is required if + * Context: Any context, but not setting STACK_DEPOT_FLAG_CAN_ALLOC is required if * alloc_pages() cannot be used from the current context. Currently * this is the case for contexts where neither %GFP_ATOMIC nor * %GFP_NOWAIT can be used (NMI, raw_spin_lock). * * Return: Handle of the stack struct stored in depot, 0 on failure */ -depot_stack_handle_t __stack_depot_save(unsigned long *entries, - unsigned int nr_entries, - gfp_t gfp_flags, bool can_alloc); +depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, + unsigned int nr_entries, + gfp_t gfp_flags, + depot_flags_t depot_flags); /** * stack_depot_save - Save a stack trace to stack depot @@ -103,7 +117,7 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, * @alloc_flags: Allocation GFP flags * * Context: Contexts where allocations via alloc_pages() are allowed. - * See __stack_depot_save() for more details. + * See stack_depot_save_flags() for more details.
* * Return: Handle of the stack trace stored in depot, 0 on failure */ -- cgit v1.2.3 From 410b764f89f59cce858d94fc781b68c1f27a0ca9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:14 +0100 Subject: lib/stackdepot: add refcount for records Add a reference counter for how many times a stack record has been added to stack depot. Add a new STACK_DEPOT_FLAG_GET flag to stack_depot_save_flags that instructs the stack depot to increment the refcount. Do not yet decrement the refcount; this is implemented in one of the following patches. Do not yet enable any users to use the flag to avoid overflowing the refcount. This is a preparatory patch for implementing the eviction of stack records from the stack depot. Link: https://lkml.kernel.org/r/a3fc14a2359d019d2a008d4ff8b46a665371ffee.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 0b262e14144e7..611716702d732 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -39,8 +39,9 @@ typedef u32 depot_flags_t; * to its declaration for more details. */ #define STACK_DEPOT_FLAG_CAN_ALLOC ((depot_flags_t)0x0001) +#define STACK_DEPOT_FLAG_GET ((depot_flags_t)0x0002) -#define STACK_DEPOT_FLAGS_NUM 1 +#define STACK_DEPOT_FLAGS_NUM 2 #define STACK_DEPOT_FLAGS_MASK ((depot_flags_t)((1 << STACK_DEPOT_FLAGS_NUM) - 1)) /* @@ -94,6 +95,9 @@ static inline int stack_depot_early_init(void) { return 0; } * flags of @alloc_flags). Otherwise, stack depot avoids any allocations and * fails if no space is left to store the stack trace.
* + * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment + * the refcount on the saved stack trace if it already exists in stack depot. + * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. * @@ -116,8 +120,11 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, * @nr_entries: Number of frames in the stack * @alloc_flags: Allocation GFP flags * - * Context: Contexts where allocations via alloc_pages() are allowed. - * See stack_depot_save_flags() for more details. + * Does not increment the refcount on the saved stack trace; see + * stack_depot_save_flags() for more details. + * + * Context: Contexts where allocations via alloc_pages() are allowed; + * see stack_depot_save_flags() for more details. * * Return: Handle of the stack trace stored in depot, 0 on failure */ -- cgit v1.2.3 From 108be8def46e9422f5a5abc96b0ab8fb6b3fb344 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 20 Nov 2023 18:47:15 +0100 Subject: lib/stackdepot: allow users to evict stack traces Add stack_depot_put, a function that decrements the reference counter on a stack record and removes it from the stack depot once the counter reaches 0. Internally, when removing a stack record, the function unlinks it from the hash table bucket and returns to the freelist. With this change, the users of stack depot can call stack_depot_put when keeping a stack trace in the stack depot is not needed anymore. This allows avoiding polluting the stack depot with irrelevant stack traces and thus have more space to store the relevant ones before the stack depot reaches its capacity. 
Link: https://lkml.kernel.org/r/1d1ad5692ee43d4fc2b3fd9d221331d30b36123f.1700502145.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 611716702d732..a6796f1789138 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -97,6 +97,8 @@ static inline int stack_depot_early_init(void) { return 0; } * * If STACK_DEPOT_FLAG_GET is set in @depot_flags, stack depot will increment * the refcount on the saved stack trace if it already exists in stack depot. + * Users of this flag must also call stack_depot_put() when keeping the stack + * trace is no longer required to avoid overflowing the refcount. * * If the provided stack trace comes from the interrupt context, only the part * up to the interrupt entry is saved. @@ -162,6 +164,18 @@ void stack_depot_print(depot_stack_handle_t stack); int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); +/** + * stack_depot_put - Drop a reference to a stack trace from stack depot + * + * @handle: Stack depot handle returned from stack_depot_save() + * + * The stack trace is evicted from stack depot once all references to it have + * been dropped (once the number of stack_depot_put() calls matches the + * number of stack_depot_save_flags() calls with STACK_DEPOT_FLAG_GET set for + * this stack trace).
+ */ +void stack_depot_put(depot_stack_handle_t handle); + /** * stack_depot_set_extra_bits - Set extra bits in a stack depot handle * -- cgit v1.2.3 From 95a2ac937013cc3aaaea02abcdd167b96874548d Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Mon, 20 Nov 2023 15:53:54 +0100 Subject: mm: use vmem_altmap code without CONFIG_ZONE_DEVICE vmem_altmap_free() and vmem_altmap_offset() could be utilized without CONFIG_ZONE_DEVICE enabled. For example, mm/memory_hotplug.c:__add_pages() relies on that. The altmap is no longer restricted to ZONE_DEVICE handling, but instead depends on CONFIG_SPARSEMEM_VMEMMAP. When CONFIG_SPARSEMEM_VMEMMAP is disabled, these functions are defined as inline stubs, ensuring compatibility with configurations that do not use sparsemem vmemmap. Without it, lkp reported the following: ld: arch/x86/mm/init_64.o: in function `remove_pagetable': init_64.c:(.meminit.text+0xfc7): undefined reference to `vmem_altmap_free' Link: https://lkml.kernel.org/r/20231120145354.308999-4-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311180545.VeyRXEDq-lkp@intel.com/ Reviewed-by: Gerald Schaefer Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Heiko Carstens Cc: Michal Hocko Cc: Oscar Salvador Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ------------ include/linux/mm.h | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1314d9c5f05b0..744c830f4b132 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -196,8 +196,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); -unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); -void
vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, @@ -228,16 +226,6 @@ static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) return false; } -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) -{ - return 0; -} - -static inline void vmem_altmap_free(struct vmem_altmap *altmap, - unsigned long nr_pfns) -{ -} - /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 13a0902717161..a422cc123a2df 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3871,6 +3871,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + if (altmap) + return altmap->reserve + altmap->free; + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} +#else +static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + return 0; +} + +static inline void vmem_altmap_free(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ +} +#endif + #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, -- cgit v1.2.3 From 38ca8a185389716e9f7566bce4bb0085f71da61d Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Nov 2023 20:43:49 +0100 Subject: pgtable: fix s390 ptdesc field comments Patch series "minor ptdesc updates", v3. This patch (of 2): Since commit d08d4e7cd6bf ("s390/mm: use full 4KB page for 2KB PTE") there is no fragmented page tracking on s390. 
Fix the corresponding comments. Link: https://lkml.kernel.org/r/cover.1700594815.git.agordeev@linux.ibm.com Link: https://lkml.kernel.org/r/2eead241f3a45bed26c7911cf66bded1e35670b8.1700594815.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Heiko Carstens Cc: Gerald Schaefer Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2a..fbec64036baa6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,11 +401,11 @@ FOLIO_MATCH(compound_head, _head_2a); * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. * @pt_mm: Used for x86 pgds. - * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. - * @_refcount: Same as page refcount. Used for s390 page tables. + * @_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good -- cgit v1.2.3 From f7dd74ac239aad5ef7575ea03c45fd7956e00285 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 21 Nov 2023 20:43:50 +0100 Subject: pgtable: rename ptdesc _refcount field to __page_refcount Rename ptdesc _refcount field to __page_refcount similar to the other unused page fields. 
Link: https://lkml.kernel.org/r/982bdc652ba79a606c3d01c905766e7e076b3315.1700594815.git.agordeev@linux.ibm.com Signed-off-by: Alexander Gordeev Suggested-by: Vishal Moola Cc: Gerald Schaefer Cc: Heiko Carstens Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fbec64036baa6..ef18d2b253788 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. - * @_refcount: Same as page refcount. + * @__page_refcount: Same as page refcount. * @pt_memcg_data: Memcg data. Tracked for page tables here. * * This struct overlays struct page for now. Do not modify without a good @@ -438,7 +438,7 @@ struct ptdesc { #endif }; unsigned int __page_type; - atomic_t _refcount; + atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long pt_memcg_data; #endif @@ -452,7 +452,7 @@ TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); -TABLE_MATCH(_refcount, _refcount); +TABLE_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG TABLE_MATCH(memcg_data, pt_memcg_data); #endif -- cgit v1.2.3 From 7679e14098c9c3c8118a7130d6e1e9cfe2565c04 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 23 Nov 2023 19:23:17 +0200 Subject: mm: list_lru: Update kernel documentation to follow the requirements kernel-doc is not happy about documentation in list_lru.h: list_lru.h:90: warning: Function parameter or member 'lru' not described in 'list_lru_add' list_lru.h:90: warning: Excess function parameter 'list_lru' description in 'list_lru_add' list_lru.h:90: warning: No description found for return value of 'list_lru_add' 
list_lru.h:103: warning: Function parameter or member 'lru' not described in 'list_lru_del' list_lru.h:103: warning: Excess function parameter 'list_lru' description in 'list_lru_del' list_lru.h:103: warning: No description found for return value of 'list_lru_del' list_lru.h:116: warning: No description found for return value of 'list_lru_count_one' list_lru.h:168: warning: No description found for return value of 'list_lru_walk_one' list_lru.h:185: warning: No description found for return value of 'list_lru_walk_one_irq' Fix the documentation accordingly. While at it, fix the references to the parameters in functions inside the long descriptions, on which the above script is not complaining (yet?). Link: https://lkml.kernel.org/r/20231123172320.2434780-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index b35968ee9fb50..db86ad78d428a 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -73,7 +73,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren /** * list_lru_add: add an element to the lru list's tail - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing @@ -83,22 +83,22 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it - * to @list_lru + * to @lru. 
* - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list - * @list_lru: the lru pointer + * @lru: the lru pointer * @item: the item to be deleted. * - * This function works analogously as list_lru_add in terms of list + * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to - * a list are also valid for list_lru_del. + * a list are also valid for list_lru_del(). * - * Return value: true if the list was updated, false otherwise + * Return: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); @@ -108,9 +108,11 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item); * @nid: the node id to count from. * @memcg: the cgroup to count from. * - * Always return a non-negative number, 0 for empty lists. There is no - * guarantee that the list is not updated while the count is being computed. - * Callers that want such a guarantee need to provide an outer lock. + * There is no guarantee that the list is not updated while the count is being + * computed. Callers that want such a guarantee need to provide an outer lock. + * + * Return: 0 for empty lists, otherwise the number of objects + * currently held by @lru. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); @@ -141,7 +143,7 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** - * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. 
@@ -150,24 +152,24 @@ typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * This function will scan all elements in a particular list_lru, calling the + * This function will scan all elements in a particular @lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback - * will return an enum lru_status telling the list_lru infrastructure what to + * will return an enum lru_status telling the @lru infrastructure what to * do with the object being scanned. * - * Please note that nr_to_walk does not mean how many objects will be freed, + * Please note that @nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * - * Return value: the number of objects effectively removed from the LRU. + * Return: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** - * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one_irq: walk a @lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. @@ -176,7 +178,7 @@ unsigned long list_lru_walk_one(struct list_lru *lru, * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * - * Same as @list_lru_walk_one except that the spinlock is acquired with + * Same as list_lru_walk_one() except that the spinlock is acquired with * spin_lock_irq(). 
*/ unsigned long list_lru_walk_one_irq(struct list_lru *lru, -- cgit v1.2.3 From 61a7a5e25fe79b6c43f1c49705a0294be113c4a5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Oct 2023 16:57:10 +0100 Subject: introduce for_other_threads(p, t) Cosmetic, but imho it makes the usage look more clear and simple, the new helper doesn't require to initialize "t". After this change while_each_thread() has only 3 users, and it is only used in the do/while loops. Link: https://lkml.kernel.org/r/20231030155710.GA9095@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Christian Brauner Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b9295..41d6759d6a4ac 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -646,6 +646,9 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define for_other_threads(p, t) \ + for (t = p; (t = next_thread(t)) != p; ) + #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) -- cgit v1.2.3 From 0eb5085c38749f2a91e5bd8cbebb1ebf3398343c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2023 14:36:38 +0100 Subject: arch: remove ARCH_TASK_STRUCT_ON_STACK IA-64 was the only architecture which selected ARCH_TASK_STRUCT_ON_STACK. IA-64 was removed with commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"). Therefore remove support for ARCH_TASK_STRUCT_ON_STACK as well. Note: this also reveals a potential bug in powerpc code, which makes use of __init_task_data without selecting ARCH_TASK_STRUCT_ON_STACK which makes __init_task_data a no-op. 
This is broken since commit d11ed3ab3166 ("Expand INIT_TASK() in init/init_task.c and remove") from 2018 and needs to be addressed separately. Link: https://lkml.kernel.org/r/20231116133638.1636277-4-hca@linux.ibm.com Signed-off-by: Heiko Carstens Reviewed-by: Arnd Bergmann Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/init_task.h | 7 ------- include/linux/sched.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 40fc5813cf932..bccb3f1f62621 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -37,13 +37,6 @@ extern struct cred init_cred; #define INIT_TASK_COMM "swapper" -/* Attach to the init_task data structure for proper alignment */ -#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -#define __init_task_data __section(".data..init_task") -#else -#define __init_task_data /**/ -#endif - /* Attach to the thread_info data structure for proper alignment */ #define __init_thread_info __section(".data..init_thread_info") diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c316972485..c2ecb2e060462 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1955,9 +1955,7 @@ extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { -#ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; -#endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif -- cgit v1.2.3 From b454ec29225cda9ae85ed0a154f4228f1922c872 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Nov 2023 16:16:49 +0100 Subject: kernel/signal.c: simplify force_sig_info_to_task(), kill recalc_sigpending_and_wake() The purpose of recalc_sigpending_and_wake() is not clear, it looks "obviously unneeded" because we are going to send the signal which can't be blocked or ignored. 
Add the comment to explain why we can't rely on send_signal_locked() and make this logic more simple/explicit. recalc_sigpending_and_wake() has no other users, it can die. In fact I think we don't even need signal_wake_up(), the target task must be either current or a TASK_TRACED child, otherwise the usage of siglock is not safe. But this needs another change. Link: https://lkml.kernel.org/r/20231120151649.GA15995@redhat.com Signed-off-by: Oleg Nesterov Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/sched/signal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 41d6759d6a4ac..015c0e3a3e1d1 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -432,7 +432,6 @@ static inline bool fault_signal_pending(vm_fault_t fault_flags, * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ -extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); -- cgit v1.2.3 From 7acf164b259d9007264d9d8501da1023f140a3b4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 15 Nov 2023 21:00:27 +0800 Subject: resource: add walk_system_ram_res_rev() This function, being a variant of walk_system_ram_res() introduced in commit 8c86e70acead ("resource: provide new functions to walk through resources"), walks through a list of all the resources of System RAM in reversed order, i.e., from higher to lower. It will be used in kexec_file code to load kernel, initrd etc when preparing kexec reboot. 
Link: https://lkml.kernel.org/r/ZVTA6z/06cLnWKUz@MiWiFi-R3L-srv Signed-off-by: AKASHI Takahiro Signed-off-by: Baoquan He Cc: Eric Biederman Signed-off-by: Andrew Morton --- include/linux/ioport.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 14f5cfabbbc86..db7fe25f33700 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -331,6 +331,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int +walk_system_ram_res_rev(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); -- cgit v1.2.3 From 7973be94724464222ae0b1860a25be04ab7b0132 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 8 Dec 2023 18:52:38 +0200 Subject: clk: x86: lpss-atom: Drop unneeded 'extern' in the header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'extern' for the functions is not needed, drop it. 
Signed-off-by: Andy Shevchenko Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231208165238.3309058-1-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/clk-lpss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/clk-lpss.h b/include/linux/platform_data/x86/clk-lpss.h index 41df326583f98..7f132029316a9 100644 --- a/include/linux/platform_data/x86/clk-lpss.h +++ b/include/linux/platform_data/x86/clk-lpss.h @@ -15,6 +15,6 @@ struct lpss_clk_data { struct clk *clk; }; -extern int lpss_atom_clk_init(void); +int lpss_atom_clk_init(void); #endif /* __CLK_LPSS_H */ -- cgit v1.2.3 From 72dd14d241e1c6e241fc5b265746c59f306c6aa3 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:37 -0800 Subject: platform/x86/intel/tpmi: Modify external interface to get read/write state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify the external interface tpmi_get_feature_status() to get read and write blocked instead of locked and disabled. Since auxiliary device is not created when disabled, no use of returning disabled state. Also locked state is not useful as feature driver can't use locked state in a meaningful way. Using read and write state, feature driver can decide which operations to restrict for that feature. 
Signed-off-by: Srinivas Pandruvada Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- include/linux/intel_tpmi.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index ee07393445f9f..4f89c5bd8663c 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -32,7 +32,6 @@ struct intel_tpmi_plat_info { struct intel_tpmi_plat_info *tpmi_get_platform_data(struct auxiliary_device *auxdev); struct resource *tpmi_get_resource_at_index(struct auxiliary_device *auxdev, int index); int tpmi_get_resource_count(struct auxiliary_device *auxdev); - -int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, int *locked, - int *disabled); +int tpmi_get_feature_status(struct auxiliary_device *auxdev, int feature_id, bool *read_blocked, + bool *write_blocked); #endif -- cgit v1.2.3 From 046d7be6210e7f870e53eb38fd410237e9d1d88f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 4 Dec 2023 14:17:38 -0800 Subject: platform/x86/intel/tpmi: Move TPMI ID definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move TPMI ID definitions to common include file. In this way other feature drivers don't have to redefine. 
Signed-off-by: Srinivas Pandruvada Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20231204221740.3645130-4-srinivas.pandruvada@linux.intel.com Signed-off-by: Hans de Goede --- include/linux/intel_tpmi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h index 4f89c5bd8663c..a3529b962be6e 100644 --- a/include/linux/intel_tpmi.h +++ b/include/linux/intel_tpmi.h @@ -12,6 +12,19 @@ #define TPMI_MINOR_VERSION(val) FIELD_GET(GENMASK(4, 0), val) #define TPMI_MAJOR_VERSION(val) FIELD_GET(GENMASK(7, 5), val) +/* + * List of supported TPMI IDs. + * Some TPMI IDs are not used by Linux, so the numbers are not consecutive. + */ +enum intel_tpmi_id { + TPMI_ID_RAPL = 0, /* Running Average Power Limit */ + TPMI_ID_PEM = 1, /* Power and Perf excursion Monitor */ + TPMI_ID_UNCORE = 2, /* Uncore Frequency Scaling */ + TPMI_ID_SST = 5, /* Speed Select Technology */ + TPMI_CONTROL_ID = 0x80, /* Special ID for getting feature status */ + TPMI_INFO_ID = 0x81, /* Special ID for PCI BDF and Package ID information */ +}; + /** * struct intel_tpmi_plat_info - Platform information for a TPMI device instance * @package_id: CPU Package id -- cgit v1.2.3 From 6bb3703aa52c9b5bb9716cbeae7350247b675209 Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:52 +0900 Subject: efi: expose efivar generic ops register function This is a preparation for supporting efivar operations provided by other than efi subsystem. Both register and unregister functions are exposed so that non-efi subsystem can revert the efi generic operation.
Acked-by: Sumit Garg Co-developed-by: Ilias Apalodimas Signed-off-by: Ilias Apalodimas Signed-off-by: Masahisa Kojima Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 9cc5bf32f6f28..1b2f50efb98ca 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1348,4 +1348,7 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table) umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n); +void efivars_generic_ops_register(void); +void efivars_generic_ops_unregister(void); + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From 1f71f37fbbd065b3326d9b7d8bb5ae688cd653d0 Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:53 +0900 Subject: efi: Add EFI_ACCESS_DENIED status code This commit adds the EFI_ACCESS_DENIED status code. Acked-by: Sumit Garg Co-developed-by: Ilias Apalodimas Signed-off-by: Ilias Apalodimas Signed-off-by: Masahisa Kojima Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 1b2f50efb98ca..3668aa204c478 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -40,6 +40,7 @@ struct screen_info; #define EFI_WRITE_PROTECTED ( 8 | (1UL << (BITS_PER_LONG-1))) #define EFI_OUT_OF_RESOURCES ( 9 | (1UL << (BITS_PER_LONG-1))) #define EFI_NOT_FOUND (14 | (1UL << (BITS_PER_LONG-1))) +#define EFI_ACCESS_DENIED (15 | (1UL << (BITS_PER_LONG-1))) #define EFI_TIMEOUT (18 | (1UL << (BITS_PER_LONG-1))) #define EFI_ABORTED (21 | (1UL << (BITS_PER_LONG-1))) #define EFI_SECURITY_VIOLATION (26 | (1UL << (BITS_PER_LONG-1))) -- cgit v1.2.3 From 94f7f6182c72ba642c1f20111681f9cc8621c95f Mon Sep 17 00:00:00 2001 From: Masahisa Kojima Date: Tue, 7 Nov 2023 14:40:55 +0900 Subject: efivarfs: automatically update super block flag efivar operation is updated 
when the tee_stmm_efi module is probed. tee_stmm_efi module supports SetVariable runtime service, but user needs to manually remount the efivarfs as RW to enable the write access if the previous efivar operation does not support SetVariable and efivarfs is mounted as read-only. This commit notifies the update of efivar operation to efivarfs subsystem, then drops SB_RDONLY flag if the efivar operation supports SetVariable. Signed-off-by: Masahisa Kojima [ardb: use per-superblock instance of the notifier block] Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 3668aa204c478..c74f47711f0bd 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1349,6 +1349,14 @@ bool efi_config_table_is_usable(const efi_guid_t *guid, unsigned long table) umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n); +/* + * efivar ops event type + */ +#define EFIVAR_OPS_RDONLY 0 +#define EFIVAR_OPS_RDWR 1 + +extern struct blocking_notifier_head efivar_ops_nh; + void efivars_generic_ops_register(void); void efivars_generic_ops_unregister(void); -- cgit v1.2.3 From 704af3a40747e395b67892127943e6ffd5e2b642 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 10 Dec 2023 21:24:43 +0100 Subject: platform/x86: wmi: Remove chardev interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The design of the WMI chardev interface is broken: - it assumes that WMI drivers are not instantiated twice - it offers next to no abstractions, the WMI driver gets a raw byte buffer - it is only used by a single driver, something which is unlikely to change Since the only user (dell-smbios-wmi) has been migrated to its own ioctl interface, remove it.
Reviewed-by: Ilpo Järvinen Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20231210202443.646427-6-W_Armin@gmx.de Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/wmi.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 8a643c39fcced..50f7f1e4fd4f8 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -11,7 +11,6 @@ #include #include #include -#include /** * struct wmi_device - WMI device structure @@ -47,8 +46,6 @@ acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct u8 wmidev_instance_count(struct wmi_device *wdev); -extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); - /** * struct wmi_driver - WMI driver structure * @driver: Driver model structure @@ -57,11 +54,8 @@ extern int set_required_buffer_size(struct wmi_device *wdev, u64 length); * @probe: Callback for device binding * @remove: Callback for device unbinding * @notify: Callback for receiving WMI events - * @filter_callback: Callback for filtering device IOCTLs * * This represents WMI drivers which handle WMI devices. - * @filter_callback is only necessary for drivers which - * want to set up a WMI IOCTL interface. 
*/ struct wmi_driver { struct device_driver driver; @@ -71,8 +65,6 @@ struct wmi_driver { int (*probe)(struct wmi_device *wdev, const void *context); void (*remove)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); - long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd, - struct wmi_ioctl_buffer *arg); }; extern int __must_check __wmi_driver_register(struct wmi_driver *driver, -- cgit v1.2.3 From 58e82a62669da52e688f4a8b89922c1839bf1001 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Mon, 11 Dec 2023 18:06:23 +0800 Subject: platform/x86/amd: Add support for AMD ACPI based Wifi band RFI mitigation feature Due to electrical and mechanical constraints in certain platform designs there may be likely interference of relatively high-powered harmonics of the (G-)DDR memory clocks with local radio module frequency bands used by Wifi 6/6e/7. To mitigate this, AMD has introduced a mechanism that devices can use to notify active use of particular frequencies so that other devices can make relative internal adjustments as necessary to avoid this resonance. 
Co-developed-by: Evan Quan Signed-off-by: Evan Quan Signed-off-by: Ma Jun Reviewed-by: Mario Limonciello Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/acpi_amd_wbrf.h | 91 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 include/linux/acpi_amd_wbrf.h (limited to 'include/linux') diff --git a/include/linux/acpi_amd_wbrf.h b/include/linux/acpi_amd_wbrf.h new file mode 100644 index 0000000000000..898f31d536d4a --- /dev/null +++ b/include/linux/acpi_amd_wbrf.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Wifi Band Exclusion Interface (AMD ACPI Implementation) + * Copyright (C) 2023 Advanced Micro Devices + */ + +#ifndef _ACPI_AMD_WBRF_H +#define _ACPI_AMD_WBRF_H + +#include +#include + +/* The maximum number of frequency band ranges */ +#define MAX_NUM_OF_WBRF_RANGES 11 + +/* Record actions */ +#define WBRF_RECORD_ADD 0x0 +#define WBRF_RECORD_REMOVE 0x1 + +/** + * struct freq_band_range - Wifi frequency band range definition + * @start: start frequency point (in Hz) + * @end: end frequency point (in Hz) + */ +struct freq_band_range { + u64 start; + u64 end; +}; + +/** + * struct wbrf_ranges_in_out - wbrf ranges info + * @num_of_ranges: total number of band ranges in this struct + * @band_list: array of Wifi band ranges + */ +struct wbrf_ranges_in_out { + u64 num_of_ranges; + struct freq_band_range band_list[MAX_NUM_OF_WBRF_RANGES]; +}; + +/** + * enum wbrf_notifier_actions - wbrf notifier actions index + * @WBRF_CHANGED: there was some frequency band updates. The consumers + * should retrieve the latest active frequency bands. 
+ */ +enum wbrf_notifier_actions { + WBRF_CHANGED, +}; + +#if IS_ENABLED(CONFIG_AMD_WBRF) +bool acpi_amd_wbrf_supported_producer(struct device *dev); +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in); +bool acpi_amd_wbrf_supported_consumer(struct device *dev); +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out); +int amd_wbrf_register_notifier(struct notifier_block *nb); +int amd_wbrf_unregister_notifier(struct notifier_block *nb); +#else +static inline +bool acpi_amd_wbrf_supported_consumer(struct device *dev) +{ + return false; +} + +static inline +int acpi_amd_wbrf_add_remove(struct device *dev, uint8_t action, struct wbrf_ranges_in_out *in) +{ + return -ENODEV; +} + +static inline +bool acpi_amd_wbrf_supported_producer(struct device *dev) +{ + return false; +} +static inline +int amd_wbrf_retrieve_freq_band(struct device *dev, struct wbrf_ranges_in_out *out) +{ + return -ENODEV; +} +static inline +int amd_wbrf_register_notifier(struct notifier_block *nb) +{ + return -ENODEV; +} +static inline +int amd_wbrf_unregister_notifier(struct notifier_block *nb) +{ + return -ENODEV; +} +#endif /* CONFIG_AMD_WBRF */ + +#endif /* _ACPI_AMD_WBRF_H */ -- cgit v1.2.3 From 2ffdd4773d98b1f7488f8e37bd881bbecec24d85 Mon Sep 17 00:00:00 2001 From: Hsiao Chien Sung Date: Tue, 24 Oct 2023 21:00:33 +0800 Subject: soc: mediatek: Support MT8188 VDOSYS1 Padding in mtk-mmsys - Add Padding components - Add Mutex module definitions for Padding Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Hsiao Chien Sung Signed-off-by: AngeloGioacchino Del Regno --- include/linux/soc/mediatek/mtk-mmsys.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 2475ef9147465..4885b065b849f 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -62,6 +62,14 @@ 
enum mtk_ddp_comp_id { DDP_COMPONENT_OVL_2L1, DDP_COMPONENT_OVL_2L2, DDP_COMPONENT_OVL1, + DDP_COMPONENT_PADDING0, + DDP_COMPONENT_PADDING1, + DDP_COMPONENT_PADDING2, + DDP_COMPONENT_PADDING3, + DDP_COMPONENT_PADDING4, + DDP_COMPONENT_PADDING5, + DDP_COMPONENT_PADDING6, + DDP_COMPONENT_PADDING7, DDP_COMPONENT_POSTMASK0, DDP_COMPONENT_PWM0, DDP_COMPONENT_PWM1, -- cgit v1.2.3 From d1c371035c8204112d84266e6bde7537f25448f7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sun, 10 Dec 2023 10:50:28 +0800 Subject: quota: convert dquot_claim_space_nodirty() to return void dquot_claim_space_nodirty() always return zero, let's convert it to return void, then, its caller can get rid of handling failure case. Signed-off-by: Chao Yu Signed-off-by: Jan Kara Message-Id: <20231210025028.3262900-1-chao@kernel.org> --- include/linux/quotaops.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 4fa4ef0a173a3..06cc8888199e8 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -74,7 +74,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); -int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); +void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); @@ -257,10 +257,9 @@ static inline void __dquot_free_space(struct inode *inode, qsize_t number, inode_sub_bytes(inode, number); } -static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) +static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); - return 0; } static inline int dquot_reclaim_space_nodirty(struct inode *inode, @@ -358,14 +357,10 @@ static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) 
DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } -static inline int dquot_claim_block(struct inode *inode, qsize_t nr) +static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { - int ret; - - ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); - if (!ret) - mark_inode_dirty_sync(inode); - return ret; + dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); + mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) -- cgit v1.2.3 From 2ebe81c814355d000fe49d9c4213983844dcb32b Mon Sep 17 00:00:00 2001 From: Aleksander Lobakin Date: Wed, 6 Dec 2023 21:59:19 +0100 Subject: net, xdp: Allow metadata > 32 32 bytes may be not enough for some custom metadata. Relax the restriction, allow metadata larger than 32 bytes and make __skb_metadata_differs() work with bigger lengths. Now size of metadata is only limited by the fact it is stored as u8 in skb_shared_info, so maximum possible value is 255. Size still has to be aligned to 4, so the actual upper limit becomes 252. Most driver implementations will offer less, none can offer more. Other important conditions, such as having enough space for xdp_frame building, are already checked in bpf_xdp_adjust_meta(). 
Signed-off-by: Aleksander Lobakin Signed-off-by: Larysa Zaremba Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/eb87653c-8ff8-447d-a7a1-25961f60518a@kernel.org Link: https://lore.kernel.org/bpf/20231206205919.404415-3-larysa.zaremba@intel.com --- include/linux/skbuff.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b370eb8d70f7f..df6ef42639d8b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4247,10 +4247,13 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + BITS_PER_LONG != 64) + goto slow; + + /* Using more efficient variant than plain call to memcmp(). */ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) @@ -4270,11 +4273,11 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; + default: +slow: + return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; -#else - return memcmp(a - meta_len, b - meta_len, meta_len); -#endif } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, -- cgit v1.2.3 From 718ab8226636a1a3a7d281f5d6a7ad7c925efe5a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Tue, 28 Nov 2023 09:15:07 +0100 Subject: PCI/ASPM: Add pci_enable_link_state_locked() Add pci_enable_link_state_locked() for enabling link states that can be used in contexts where a pci_bus_sem read lock is already held (e.g. from pci_walk_bus()). 
This helper will be used to fix a couple of potential deadlocks where the current helper is called with the lock already held, hence the CC stable tag. Fixes: f492edb40b54 ("PCI: vmd: Add quirk to configure PCIe ASPM and LTR") Link: https://lore.kernel.org/r/20231128081512.19387-2-johan+linaro@kernel.org Signed-off-by: Johan Hovold [bhelgaas: include helper name in subject, commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Manivannan Sadhasivam Cc: # 6.3 Cc: Michael Bottini Cc: David E. Box --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc8679..dea043bc1e383 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1829,6 +1829,7 @@ extern bool pcie_ports_native; int pci_disable_link_state(struct pci_dev *pdev, int state); int pci_disable_link_state_locked(struct pci_dev *pdev, int state); int pci_enable_link_state(struct pci_dev *pdev, int state); +int pci_enable_link_state_locked(struct pci_dev *pdev, int state); void pcie_no_aspm(void); bool pcie_aspm_support_enabled(void); bool pcie_aspm_enabled(struct pci_dev *pdev); @@ -1839,6 +1840,8 @@ static inline int pci_disable_link_state_locked(struct pci_dev *pdev, int state) { return 0; } static inline int pci_enable_link_state(struct pci_dev *pdev, int state) { return 0; } +static inline int pci_enable_link_state_locked(struct pci_dev *pdev, int state) +{ return 0; } static inline void pcie_no_aspm(void) { } static inline bool pcie_aspm_support_enabled(void) { return false; } static inline bool pcie_aspm_enabled(struct pci_dev *pdev) { return false; } -- cgit v1.2.3 From 46eae99ef73302f9fb3dddcd67c374b3dffe8fd6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 2023 16:02:02 +0200 Subject: add statmount(2) syscall Add a way to query attributes of a single mount instead of having to parse the complete /proc/$PID/mountinfo, which might be huge. Look up the mount by the new 64bit mount ID.
If a mount needs to be queried based on path, then statx(2) can be used to first query the mount ID belonging to the path. Design is based on a suggestion by Linus: "So I'd suggest something that is very much like "statfsat()", which gets a buffer and a length, and returns an extended "struct statfs" *AND* just a string description at the end." The interface closely mimics that of statx. Handle ASCII attributes by appending after the end of the structure (as per above suggestion). Pointers to strings are stored in u64 members to make the structure the same regardless of pointer size. Strings are nul terminated. Link: https://lore.kernel.org/all/CAHk-=wh5YifP7hzKSbwJj94+DZ2czjrZsczy6GBimiogZws=rg@mail.gmail.com/ Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-5-mszeredi@redhat.com Reviewed-by: Ian Kent [Christian Brauner : various minor changes] Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fd9d12de7e929..530ca9adf5f18 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -74,6 +74,8 @@ struct landlock_ruleset_attr; enum landlock_rule_type; struct cachestat_range; struct cachestat; +struct statmount; +struct mnt_id_req; #include #include @@ -407,6 +409,9 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf); asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf); +asmlinkage long sys_statmount(const struct mnt_id_req __user *req, + struct statmount __user *buf, size_t bufsize, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); #if BITS_PER_LONG == 32 -- cgit v1.2.3 From 5bc2ea60897e0f899fb93930dd867dae7c8eb11f Mon Sep 17 00:00:00 
2001 From: Javier Carrasco Date: Mon, 11 Dec 2023 20:27:47 +0800 Subject: iio: core: introduce trough info element for minimum values The IIO_CHAN_INFO_PEAK info element is used for maximum values and currently there is no equivalent for minimum values. Instead of overloading the existing peak info element, a new info element can be added. In principle there is no need to add a _TROUGH_SCALE element as the scale will be the same as the one required for INFO_PEAK, which in turn is sometimes omitted if a single scale for peaks and raw values is required. Add an IIO_CHAN_INFO_TROUGH info element for minimum values. Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20231211122747.9723-1-579lpy@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 117bde7d6ad79..d89982c98368c 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -68,6 +68,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_THERMOCOUPLE_TYPE, IIO_CHAN_INFO_CALIBAMBIENT, IIO_CHAN_INFO_ZEROPOINT, + IIO_CHAN_INFO_TROUGH, }; #endif /* _IIO_TYPES_H_ */ -- cgit v1.2.3 From 4649620d9404d3aceb25891c24bab77143e3f21c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 8 Dec 2023 20:13:44 +0100 Subject: thermal: core: Make thermal_zone_device_unregister() return after freeing the zone Make thermal_zone_device_unregister() wait until all of the references to the given thermal zone object have been dropped and free it before returning. This guarantees that when thermal_zone_device_unregister() returns, there is no leftover activity regarding the thermal zone in question which is required by some of its callers (for instance, modular driver code that wants to know when it is safe to let the module go away). 
Subsequently, this will allow some confusing device_is_registered() checks to be dropped from the thermal sysfs and core code. Signed-off-by: Rafael J. Wysocki Reviewed-and-tested-by: Lukasz Luba Acked-by: Daniel Lezcano --- include/linux/thermal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0ea99f50d57c5..bedbaec9a42e1 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -117,6 +117,7 @@ struct thermal_cooling_device { * @id: unique id number for each thermal zone * @type: the thermal zone device type * @device: &struct device for this thermal zone + * @removal: removal completion * @trip_temp_attrs: attributes for trip points for sysfs: trip temperature * @trip_type_attrs: attributes for trip points for sysfs: trip type * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis @@ -156,6 +157,7 @@ struct thermal_zone_device { int id; char type[THERMAL_NAME_LENGTH]; struct device device; + struct completion removal; struct attribute_group trips_attribute_group; struct thermal_attr *trip_temp_attrs; struct thermal_attr *trip_type_attrs; -- cgit v1.2.3 From 4e58aaeebb3c27993c734c99eae6881b196b1ddb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 1 Nov 2023 18:28:38 -0700 Subject: rcu: Restrict access to RCU CPU stall notifiers The RCU CPU stall notifiers can be useful for dumping state when tracking down delicate forward-progress bugs where NUMA effects cause cache lines to be delivered to a given CPU regularly, but always in a state that prevents that CPU from making forward progress. These bugs can be detected by the RCU CPU stall-warning mechanism, but in some cases, the stall-warnings printk()s disrupt the forward-progress bug before any useful state can be obtained. Unfortunately, the notifier mechanism added by commit 5b404fdabacf ("rcu: Add RCU CPU stall notifier") can make matters worse if used at all carelessly.
For example, if the stall warning was caused by a lock not being released, then any attempt to acquire that lock in the notifier will hang. This will prevent not only the notifier from producing any useful output, but it will also prevent the stall-warning message from ever appearing. This commit therefore hides this new RCU CPU stall notifier mechanism under a new RCU_CPU_STALL_NOTIFIER Kconfig option that depends on both DEBUG_KERNEL and RCU_EXPERT. In addition, the rcupdate.rcu_cpu_stall_notifiers=1 kernel boot parameter must also be specified. The RCU_CPU_STALL_NOTIFIER Kconfig option's help text contains a warning and explains the dangers of careless use, recommending lockless notifier code. In addition, a WARN() is triggered each time that an attempt is made to register a stall-warning notifier in kernels built with CONFIG_RCU_CPU_STALL_NOTIFIER=y. This combination of measures will keep use of this mechanism confined to debug kernels and away from routine deployments. [ paulmck: Apply Dan Carpenter feedback. ] Fixes: 5b404fdabacf ("rcu: Add RCU CPU stall notifier") Reported-by: Linus Torvalds Signed-off-by: Paul E. 
McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay (AMD) --- include/linux/rcu_notifier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h index ebf371364581d..5640f024773b3 100644 --- a/include/linux/rcu_notifier.h +++ b/include/linux/rcu_notifier.h @@ -13,7 +13,7 @@ #define RCU_STALL_NOTIFY_NORM 1 #define RCU_STALL_NOTIFY_EXP 2 -#ifdef CONFIG_RCU_STALL_COMMON +#if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) #include #include @@ -21,12 +21,12 @@ int rcu_stall_chain_notifier_register(struct notifier_block *n); int rcu_stall_chain_notifier_unregister(struct notifier_block *n); -#else // #ifdef CONFIG_RCU_STALL_COMMON +#else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) // No RCU CPU stall warnings in Tiny RCU. static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; } static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; } -#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON +#endif // #else // #if defined(CONFIG_RCU_STALL_COMMON) && defined(CONFIG_RCU_CPU_STALL_NOTIFIER) #endif /* __LINUX_RCU_NOTIFIER_H */ -- cgit v1.2.3 From 23d90b2404050c00c15058710d56bb46e1c5ab36 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Fri, 20 Oct 2023 18:30:15 +0100 Subject: rcu: Remove unused macros from rcupdate.h ulong2long, USHORT_CMP_GE and USHORT_CMP_LT are redundant and have been unused for quite a few releases. Signed-off-by: Pedro Falcato Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- include/linux/rcupdate.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f7206b2623c98..aa87c82236dd9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -34,9 +34,6 @@ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) -#define ulong2long(a) (*(long *)(&(a))) -#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) -#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); -- cgit v1.2.3 From c5e2a973448d958feb7881e4d875eac59fdeff3d Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 8 Dec 2023 16:28:41 -0300 Subject: rtnl: add helper to check if rtnl group has listeners As of today, rtnl code creates a new skb and unconditionally fills and broadcasts it to the relevant group. For most operations this is okay and doesn't waste resources in general. When operations are done without the rtnl_lock, as in tc-flower, such skb allocation, message fill and no-op broadcasting can happen in all cores of the system, which contributes to system pressure and wastes precious cpu cycles when no one will receive the built message. Introduce this helper so rtnetlink operations can simply check if someone is listening and then proceed if necessary. 
Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20231208192847.714940-2-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3d6cf306cd55e..a7d757e96c55f 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -130,4 +130,11 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, extern void rtnl_offload_xstats_notify(struct net_device *dev); +static inline int rtnl_has_listeners(const struct net *net, u32 group) +{ + struct sock *rtnl = net->rtnl; + + return netlink_has_listeners(rtnl, group); +} + #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From 8439109b76a3c405808383bf9dd532fc4b9c2dbd Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 8 Dec 2023 16:28:42 -0300 Subject: rtnl: add helper to check if a notification is needed Building on the rtnl_has_listeners helper, add the rtnl_notify_needed helper to check if we can bail out early in the notification routines. 
Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20231208192847.714940-3-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index a7d757e96c55f..0cbbbded03319 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -137,4 +137,19 @@ static inline int rtnl_has_listeners(const struct net *net, u32 group) return netlink_has_listeners(rtnl, group); } +/** + * rtnl_notify_needed - check if notification is needed + * @net: Pointer to the net namespace + * @nlflags: netlink ingress message flags + * @group: rtnl group + * + * Based on the ingress message flags and rtnl group, returns true + * if a notification is needed, false otherwise. + */ +static inline bool +rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) +{ + return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); +} + #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From ddb6b284bdc32b6e218b3d90b5a745ea26620812 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 8 Dec 2023 16:28:43 -0300 Subject: rtnl: add helper to send if skb is not null This is a convenience helper for routines handling conditional rtnl events, that is code that might send a notification depending on rtnl_has_listeners/rtnl_notify_needed. Instead of: if (skb) rtnetlink_send(...) Use: rtnetlink_maybe_send(...) 
Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20231208192847.714940-4-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0cbbbded03319..6a8543b34e2c0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -10,6 +10,13 @@ #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); + +static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net, + u32 pid, u32 group, int echo) +{ + return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo); +} + extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags); -- cgit v1.2.3 From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:21 -0800 Subject: bpf: tidy up exception callback management a bit Use the fact that we are passing subprog index around and have a corresponding struct bpf_subprog_info in bpf_verifier_env for each subprogram. We don't need to separately pass around a flag whether subprog is exception callback or not, each relevant verifier function can determine this using provided subprog index if we maintain bpf_subprog_info properly. Also move out exception callback-specific logic from btf_prepare_func_args(), keeping it generic. We can enforce all these restrictions right before exception callback verification pass. We add an out parameter, arg_cnt, for now, but this will be unnecessary with subsequent refactoring and will be removed. 
Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c1a06263a4f36..0bd4889e917a3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, bool is_ex_cb); + struct bpf_reg_state *reg, u32 *nargs); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, -- cgit v1.2.3 From 406a6fa44bfbc8563f0612b08d43df2fa65e8bc5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:22 -0800 Subject: bpf: use bitfields for simple per-subprog bool flags We have a bunch of bool flags for each subprog. Instead of wasting bytes for them, use bitfields instead. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 314b679fb4940..c2819a6579a52 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -611,12 +611,12 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. 
stack depth used by this function */ - bool has_tail_call; - bool tail_call_reachable; - bool has_ld_abs; - bool is_cb; - bool is_async_cb; - bool is_exception_cb; + bool has_tail_call: 1; + bool tail_call_reachable: 1; + bool has_ld_abs: 1; + bool is_cb: 1; + bool is_async_cb: 1; + bool is_exception_cb: 1; }; struct bpf_verifier_env; -- cgit v1.2.3 From 1ca51628e7303718fdabe29c7d36f582500d5cf2 Mon Sep 17 00:00:00 2001 From: Shun Hao Date: Wed, 6 Dec 2023 16:01:34 +0200 Subject: net/mlx5: Introduce indirect-sw-encap ICM properties Add new fields for device memory capabilities, in order to support creation of new ICM memory type of SW encap. Signed-off-by: Shun Hao Link: https://lore.kernel.org/r/107cca7dd6a932a1704abf6ebd1b801105546a8e.1701871118.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f386..02b25dc361437 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1193,7 +1193,8 @@ struct mlx5_ifc_device_mem_cap_bits { u8 log_sw_icm_alloc_granularity[0x6]; u8 log_steering_sw_icm_size[0x8]; - u8 reserved_at_120[0x18]; + u8 log_indirect_encap_sw_icm_size[0x8]; + u8 reserved_at_128[0x10]; u8 log_header_modify_pattern_sw_icm_size[0x8]; u8 header_modify_sw_icm_start_address[0x40]; @@ -1204,7 +1205,11 @@ struct mlx5_ifc_device_mem_cap_bits { u8 memic_operations[0x20]; - u8 reserved_at_220[0x5e0]; + u8 reserved_at_220[0x20]; + + u8 indirect_encap_sw_icm_start_address[0x40]; + + u8 reserved_at_280[0x580]; }; struct mlx5_ifc_device_event_cap_bits { -- cgit v1.2.3 From a429ec96c07f3020af12029acefc46f42ff5c91c Mon Sep 17 00:00:00 2001 From: Shun Hao Date: Wed, 6 Dec 2023 16:01:35 +0200 Subject: RDMA/mlx5: Support handling of SW encap ICM area New type for this ICM area, now the user can allocate/deallocate the new type of SW encap 
ICM memory, to store the encap header data which are managed by SW. Signed-off-by: Shun Hao Link: https://lore.kernel.org/r/546fe43fc700240709e30acf7713ec6834d652bd.1701871118.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d2b8d4a74a308..96cb8845682d2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -688,6 +688,7 @@ enum mlx5_sw_icm_type { MLX5_SW_ICM_TYPE_STEERING, MLX5_SW_ICM_TYPE_HEADER_MODIFY, MLX5_SW_ICM_TYPE_HEADER_MODIFY_PATTERN, + MLX5_SW_ICM_TYPE_SW_ENCAP, }; #define MLX5_MAX_RESERVED_GIDS 8 -- cgit v1.2.3 From eb524d0fd46249b0b9e5d52372dc65d8b32430c3 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 6 Dec 2023 16:01:37 +0200 Subject: net/mlx5: E-Switch, expose eswitch manager vport Expose the ability to query the eswitch manager vport number. Next patch will utilize this capability to reveal the correct register C0 value to the users. Signed-off-by: Mark Bloch Link: https://lore.kernel.org/r/614fb0e216250e2ce3340471ec141b83ec45c7f4.1701871118.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/eswitch.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 950d2431a53c8..df73a2ccc9af3 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -7,6 +7,7 @@ #define _MLX5_ESWITCH_ #include +#include #include #define MLX5_ESWITCH_MANAGER(mdev) MLX5_CAP_GEN(mdev, eswitch_manager) @@ -210,4 +211,11 @@ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev) return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS; } +/* The returned number is valid only when the dev is eswitch manager. 
*/ +static inline u16 mlx5_eswitch_manager_vport(struct mlx5_core_dev *dev) +{ + return mlx5_core_is_ecpf_esw_manager(dev) ? + MLX5_VPORT_ECPF : MLX5_VPORT_PF; +} + #endif -- cgit v1.2.3 From 98fb9b9680c9f3895ced02d6a73e27f5d7b5892b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Dec 2023 22:37:57 +0100 Subject: wifi: ieee80211: don't require protected vendor action frames For vendor action frames, whether a protected one should be used or not is clearly up to the individual vendor and frame, so even though a protected dual is defined, it may not get used. Thus, don't require protection for vendor action frames when they're used in a connection. Since we obviously don't process frames unknown to the kernel in the kernel, it may make sense to invert this list to have all the ones the kernel processes and knows to be requiring protection, but that'd be a different change. Fixes: 91535613b609 ("wifi: mac80211: don't drop all unprotected public action frames") Reported-by: Jouni Malinen Link: https://msgid.link/20231206223801.f6a2cf4e67ec.Ifa6acc774bd67801d3dafb405278f297683187aa@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c2ac9e9e7ee9a..2b5e500bf0930 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4447,7 +4447,8 @@ ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb) action != WLAN_PUB_ACTION_LOC_TRACK_NOTI && action != WLAN_PUB_ACTION_FTM_REQUEST && action != WLAN_PUB_ACTION_FTM_RESPONSE && - action != WLAN_PUB_ACTION_FILS_DISCOVERY; + action != WLAN_PUB_ACTION_FILS_DISCOVERY && + action != WLAN_PUB_ACTION_VENDOR_SPECIFIC; } /** -- cgit v1.2.3 From 8f23f5dba6b4693448144bde4dd6f537543442c2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Oct 2023 08:05:20 +0800 Subject: iommu: Change kconfig around IOMMU_SVA Linus suggested that 
the kconfig here is confusing: https://lore.kernel.org/all/CAHk-=wgUiAtiszwseM1p2fCJ+sC4XWQ+YN4TanFhUgvUqjr9Xw@mail.gmail.com/ Let's break it into three kconfigs controlling distinct things: - CONFIG_IOMMU_MM_DATA controls if the mm_struct has the additional fields for the IOMMU. Currently only PASID, but later patches store a struct iommu_mm_data * - CONFIG_ARCH_HAS_CPU_PASID controls if the arch needs the scheduling bit for keeping track of the ENQCMD instruction. x86 will select this if IOMMU_SVA is enabled - IOMMU_SVA controls if the IOMMU core compiles in the SVA support code for iommu driver use and the IOMMU exported API This way ARM will not enable CONFIG_ARCH_HAS_CPU_PASID Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-2-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- include/linux/mm_types.h | 2 +- include/linux/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c7394b39599c8..cd3f398095bf3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1337,7 +1337,7 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream return false; } -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { mm->pasid = IOMMU_PASID_INVALID; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2a..41f248608dd98 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -938,7 +938,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_IOMMU_MM_DATA u32 pasid; #endif #ifdef CONFIG_KSM diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c316972485..70888a36677b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -954,7 +954,7 @@ struct task_struct { /* Recursion prevention for 
eventfd_signal() */ unsigned in_eventfd:1; #endif -#ifdef CONFIG_IOMMU_SVA +#ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL -- cgit v1.2.3 From 2396046d75d3c0b2cfead852a77efd023f8539dc Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:22 +0800 Subject: iommu: Add mm_get_enqcmd_pasid() helper function mm_get_enqcmd_pasid() should be used by architecture code and closely related code to learn the PASID value that the x86 ENQCMD operation should use for the mm. For the moment SMMUv3 uses this without any connection to ENQCMD; it will be cleaned up similar to how the prior patch made VT-d use the PASID argument of set_dev_pasid(). The motivation is to replace mm->pasid with an iommu private data structure that is introduced in a later patch. Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-4-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index cd3f398095bf3..4fb239c6ca8d5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1346,6 +1346,12 @@ static inline bool mm_valid_pasid(struct mm_struct *mm) { return mm->pasid != IOMMU_PASID_INVALID; } + +static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) +{ + return mm->pasid; +} + void mm_pasid_drop(struct mm_struct *mm); struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); @@ -1368,6 +1374,12 @@ static inline u32 iommu_sva_get_pasid(struct iommu_sva *handle) } static inline void mm_pasid_init(struct mm_struct *mm) {} static inline bool mm_valid_pasid(struct mm_struct *mm) { return false; } + +static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) +{ + return IOMMU_PASID_INVALID; +} + static inline void 
mm_pasid_drop(struct mm_struct *mm) {} #endif /* CONFIG_IOMMU_SVA */ -- cgit v1.2.3 From 541a3e257d48c16b77d19f39ed939ef5832046df Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:23 +0800 Subject: mm: Add structure to keep sva information Introduce iommu_mm_data structure to keep sva information (pasid and the related sva domains). Add iommu_mm pointer, pointing to an instance of iommu_mm_data structure, to mm. Reviewed-by: Vasant Hegde Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-5-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 +++++ include/linux/mm_types.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 4fb239c6ca8d5..f7b1b469e98d6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -812,6 +812,11 @@ struct iommu_sva { struct iommu_domain *domain; }; +struct iommu_mm_data { + u32 pasid; + struct list_head sva_domains; +}; + int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 41f248608dd98..0b4314fab4787 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -727,6 +727,7 @@ struct mm_cid { #endif struct kioctx_table; +struct iommu_mm_data; struct mm_struct { struct { /* @@ -940,6 +941,7 @@ struct mm_struct { #ifdef CONFIG_IOMMU_MM_DATA u32 pasid; + struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM /* -- cgit v1.2.3 From 092edaddb660376648acb97678570ed5d8299768 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:24 +0800 Subject: iommu: Support mm PASID 1:n with sva domains Each mm bound to devices gets a PASID and corresponding sva domains allocated in 
iommu_sva_bind_device(), which are referenced by iommu_mm field of the mm. The PASID is released in __mmdrop(), while a sva domain is released when no one is using it (the reference count is decremented in iommu_sva_unbind_device()). However, although sva domains and their PASID are separate objects such that their own life cycles could be handled independently, an enqcmd use case may require releasing the PASID in releasing the mm (i.e., once a PASID is allocated for a mm, it will be permanently used by the mm and won't be released until the end of mm) and only allows to drop the PASID after the sva domains are released. To this end, mmgrab() is called in iommu_sva_domain_alloc() to increment the mm reference count and mmdrop() is invoked in iommu_domain_free() to decrement the mm reference count. Since the required info of PASID and sva domains is kept in struct iommu_mm_data of a mm, use mm->iommu_mm field instead of the old pasid field in mm struct. The sva domain list is protected by iommu_sva_lock. Besides, this patch removes mm_pasid_init(), as with the introduced iommu_mm structure, initializing mm pasid in mm_init() is unnecessary. Reviewed-by: Lu Baolu Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Tested-by: Nicolin Chen Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-6-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f7b1b469e98d6..c6bbbe0901d0c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -121,6 +121,11 @@ struct iommu_domain { struct { /* IOMMU_DOMAIN_SVA */ struct mm_struct *mm; int users; + /* + * Next iommu_domain in mm->iommu_mm->sva-domains list + * protected by iommu_sva_lock. 
+ */ + struct list_head next; }; }; }; @@ -1345,16 +1350,28 @@ static inline bool tegra_dev_iommu_get_stream_id(struct device *dev, u32 *stream #ifdef CONFIG_IOMMU_MM_DATA static inline void mm_pasid_init(struct mm_struct *mm) { - mm->pasid = IOMMU_PASID_INVALID; + /* + * During dup_mm(), a new mm will be memcpy'd from an old one and that makes + * the new mm and the old one point to a same iommu_mm instance. When either + * one of the two mms gets released, the iommu_mm instance is freed, leaving + * the other mm running into a use-after-free/double-free problem. To avoid + * the problem, zeroing the iommu_mm pointer of a new mm is needed here. + */ + mm->iommu_mm = NULL; } + static inline bool mm_valid_pasid(struct mm_struct *mm) { - return mm->pasid != IOMMU_PASID_INVALID; + return READ_ONCE(mm->iommu_mm); } static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) { - return mm->pasid; + struct iommu_mm_data *iommu_mm = READ_ONCE(mm->iommu_mm); + + if (!iommu_mm) + return IOMMU_PASID_INVALID; + return iommu_mm->pasid; } void mm_pasid_drop(struct mm_struct *mm); -- cgit v1.2.3 From 1fa05c932dc71c474da38e4fd0456131128f8486 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Fri, 27 Oct 2023 08:05:25 +0800 Subject: mm: Deprecate pasid field Drop the pasid field, as all the information needed for sva domain management has been moved to the newly added iommu_mm field. 
Reviewed-by: Lu Baolu Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Tina Zhang Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20231027000525.1278806-7-tina.zhang@intel.com Signed-off-by: Joerg Roedel --- include/linux/mm_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b4314fab4787..ec71c91e210ba 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -940,7 +940,6 @@ struct mm_struct { struct work_struct async_put_work; #ifdef CONFIG_IOMMU_MM_DATA - u32 pasid; struct iommu_mm_data *iommu_mm; #endif #ifdef CONFIG_KSM -- cgit v1.2.3 From 4720287c7bf76e59d19d4dfbdc3f54eeea6fd46b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:08 -0400 Subject: iommu: Remove struct iommu_ops *iommu from arch_setup_dma_ops() This is not being used to pass ops, it is just a way to tell if an iommu driver was probed. These days this can be detected directly via device_iommu_mapped(). Call device_iommu_mapped() in the two places that need to check it and remove the iommu parameter everywhere. 
Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Reviewed-by: Moritz Fischer Acked-by: Christoph Hellwig Acked-by: Rob Herring Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/dma-map-ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a52e508d1869f..e9cc317e9d7de 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -427,10 +427,10 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, - const struct iommu_ops *iommu, bool coherent); + bool coherent); #else static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, - u64 size, const struct iommu_ops *iommu, bool coherent) + u64 size, bool coherent) { } #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */ -- cgit v1.2.3 From 6ff6e184f1f4d4993d45ca3f934c8288890965fe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:09 -0400 Subject: iommmu/of: Do not return struct iommu_ops from of_iommu_configure() Nothing needs this pointer. Return a normal error code with the usual IOMMU semantic that ENODEV means 'there is no IOMMU driver'. 
Reviewed-by: Jerry Snitselaar Reviewed-by: Lu Baolu Acked-by: Rob Herring Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/of_iommu.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index 9a5e6b410dd2f..e61cbbe12dac6 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -8,20 +8,19 @@ struct iommu_ops; #ifdef CONFIG_OF_IOMMU -extern const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id); +extern int of_iommu_configure(struct device *dev, struct device_node *master_np, + const u32 *id); extern void of_iommu_get_resv_regions(struct device *dev, struct list_head *list); #else -static inline const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np, - const u32 *id) +static inline int of_iommu_configure(struct device *dev, + struct device_node *master_np, + const u32 *id) { - return NULL; + return -ENODEV; } static inline void of_iommu_get_resv_regions(struct device *dev, -- cgit v1.2.3 From eda1a94caf6b05482bbf57dc244e7a31a9dba77c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 7 Dec 2023 14:03:12 -0400 Subject: iommu: Mark dev_iommu_priv_set() with a lockdep A perfect driver would only call dev_iommu_priv_set() from its probe callback. We've made it functionally correct to call it from the of_xlate by adding a lock around that call. lockdep assert that iommu_probe_device_lock is held to discourage misuse. Exclude PPC kernels with CONFIG_FSL_PAMU turned on because FSL_PAMU uses a global static for its priv and abuses priv for its domain. Remove the pointless stores of NULL, all these are on paths where the core code will free dev->iommu after the op returns. 
Reviewed-by: Lu Baolu Reviewed-by: Jerry Snitselaar Tested-by: Hector Martin Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-16e4def25ebb+820-iommu_fwspec_p1_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c6bbbe0901d0c..3a556996fea7f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -850,10 +850,7 @@ static inline void *dev_iommu_priv_get(struct device *dev) return NULL; } -static inline void dev_iommu_priv_set(struct device *dev, void *priv) -{ - dev->iommu->priv = priv; -} +void dev_iommu_priv_set(struct device *dev, void *priv); extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); -- cgit v1.2.3 From 62e1f212e5fe7624249212813ee96202e0c31430 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:14 +0000 Subject: arm: perf/kvm: Use GENMASK for ARMV8_PMU_PMCR_N This is so that FIELD_GET and FIELD_PREP can be used and that the fields are in a consistent format to arm64/tools/sysreg Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-3-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 9c226adf938a2..ed62bd75cec7a 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -215,8 +215,7 @@ #define ARMV8_PMU_PMCR_DP (1 << 5) /* Disable CCNT if non-invasive debug*/ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ -#define ARMV8_PMU_PMCR_N_SHIFT 11 /* Number of counters supported */ -#define ARMV8_PMU_PMCR_N_MASK 0x1f +#define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of 
counters supported */ #define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ /* -- cgit v1.2.3 From 2f6a00f30600417ee2737f2b1229c75663f1e3c9 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:15 +0000 Subject: arm: perf: Use GENMASK for PMMIR fields This is so that FIELD_GET and FIELD_PREP can be used and that the fields are in a consistent format to arm64/tools/sysreg Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-4-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index ed62bd75cec7a..1bc7678c10d4f 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -250,12 +250,9 @@ #define ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ /* PMMIR_EL1.SLOTS mask */ -#define ARMV8_PMU_SLOTS_MASK 0xff - -#define ARMV8_PMU_BUS_SLOTS_SHIFT 8 -#define ARMV8_PMU_BUS_SLOTS_MASK 0xff -#define ARMV8_PMU_BUS_WIDTH_SHIFT 16 -#define ARMV8_PMU_BUS_WIDTH_MASK 0xf +#define ARMV8_PMU_SLOTS GENMASK(7, 0) +#define ARMV8_PMU_BUS_SLOTS GENMASK(15, 8) +#define ARMV8_PMU_BUS_WIDTH GENMASK(19, 16) /* * This code is really good -- cgit v1.2.3 From d30f09b6d7de5d159dbb537f9d67dceb67409420 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:16 +0000 Subject: arm: perf: Convert remaining fields to use GENMASK Convert the remaining fields to use either GENMASK or be built from other fields. These all already started at bit 0 so don't need a code change for the lack of _SHIFT. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-5-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 1bc7678c10d4f..daa63542242dd 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -216,19 +216,25 @@ #define ARMV8_PMU_PMCR_LC (1 << 6) /* Overflow on 64 bit cycle counter */ #define ARMV8_PMU_PMCR_LP (1 << 7) /* Long event counter enable */ #define ARMV8_PMU_PMCR_N GENMASK(15, 11) /* Number of counters supported */ -#define ARMV8_PMU_PMCR_MASK 0xff /* Mask for writable bits */ +/* Mask for writable bits */ +#define ARMV8_PMU_PMCR_MASK (ARMV8_PMU_PMCR_E | ARMV8_PMU_PMCR_P | \ + ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_D | \ + ARMV8_PMU_PMCR_X | ARMV8_PMU_PMCR_DP | \ + ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP) /* * PMOVSR: counters overflow flag status reg */ -#define ARMV8_PMU_OVSR_MASK 0xffffffff /* Mask for writable bits */ -#define ARMV8_PMU_OVERFLOWED_MASK ARMV8_PMU_OVSR_MASK +#define ARMV8_PMU_OVSR_P GENMASK(30, 0) +#define ARMV8_PMU_OVSR_C BIT(31) +/* Mask for writable bits is both P and C fields */ +#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C) /* * PMXEVTYPER: Event selection reg */ #define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ -#define ARMV8_PMU_EVTYPE_EVENT 0xffff /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ /* * Event filters for PMUv3 @@ -243,11 +249,13 @@ /* * PMUSERENR: user enable reg */ -#define ARMV8_PMU_USERENR_MASK 0xf /* Mask for writable bits */ #define ARMV8_PMU_USERENR_EN (1 << 0) /* PMU regs can be accessed at EL0 */ #define ARMV8_PMU_USERENR_SW (1 << 1) /* PMSWINC can be written at EL0 */ #define ARMV8_PMU_USERENR_CR (1 << 2) /* Cycle counter can be read at EL0 */ #define 
ARMV8_PMU_USERENR_ER (1 << 3) /* Event counter can be read at EL0 */ +/* Mask for writable bits */ +#define ARMV8_PMU_USERENR_MASK (ARMV8_PMU_USERENR_EN | ARMV8_PMU_USERENR_SW | \ + ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_ER) /* PMMIR_EL1.SLOTS mask */ #define ARMV8_PMU_SLOTS GENMASK(7, 0) -- cgit v1.2.3 From 3115ee021bfb04efde2e96507bfcc1330261a6a1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:17 +0000 Subject: arm64: perf: Include threshold control fields in PMEVTYPER mask FEAT_PMUv3_TH (Armv8.8) adds two new fields to PMEVTYPER, so include them in the mask. These aren't writable on 32 bit kernels as they are in the high part of the register, so only include them for arm64. It would be difficult to do this statically in the asm header files for each platform without resulting in circular includes or #ifdefs inline in the code. For that reason the ARMV8_PMU_EVTYPE_MASK definition has been removed and the mask is constructed programmatically. Reviewed-by: Suzuki K Poulose Reviewed-by: Anshuman Khandual Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-6-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index daa63542242dd..91957b3468e9a 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -233,8 +233,9 @@ /* * PMXEVTYPER: Event selection reg */ -#define ARMV8_PMU_EVTYPE_MASK 0xc800ffff /* Mask for writable bits */ #define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ +#define ARMV8_PMU_EVTYPE_TH GENMASK(43, 32) +#define ARMV8_PMU_EVTYPE_TC GENMASK(63, 61) /* * Event filters for PMUv3 -- cgit v1.2.3 From f6da86969a3c284466ab6080764b2ed91689f262 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:18 +0000 Subject: arm: pmu: Share user ABI format 
mechanism with SPE This mechanism makes it much easier to define and read new attributes so move it to the arm_pmu.h header so that it can be shared. At the same time update the existing format attributes to use it. GENMASK has to be changed to GENMASK_ULL because the config fields are 64 bits even on arm32 where this will also be used now. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-7-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index e2503d48ddee6..b3b34f6670cfb 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -183,4 +183,26 @@ void armpmu_free_irq(int irq, int cpu); #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" #define ARMV8_TRBE_PDEV_NAME "arm,trbe" +/* Why does everything I do descend into this? */ +#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi + +#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) + +#define GEN_PMU_FORMAT_ATTR(name) \ + PMU_FORMAT_ATTR(name, \ + _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI)) + +#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ + ((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0)) + +#define ATTR_CFG_GET_FLD(attr, name) \ + _ATTR_CFG_GET_FLD(attr, \ + ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI) + #endif /* __ARM_PMU_H__ */ -- cgit v1.2.3 From 816c26754447e8b28d6c604e1f5b1d205b2586ee Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 11 Dec 2023 16:13:22 +0000 Subject: arm64: perf: Add support for event counting threshold FEAT_PMUv3_TH (Armv8.8) permits a PMU counter to increment only on events whose count meets a specified threshold condition. 
For example if PMEVTYPERn.TC (Threshold Control) is set to 0b101 (Greater than or equal, count), and the threshold is set to 2, then the PMU counter will now only increment by 1 when an event would have previously incremented the PMU counter by 2 or more on a single processor cycle. Three new Perf event config fields, 'threshold', 'threshold_compare' and 'threshold_count' have been added to control the feature. threshold_compare maps to the upper two bits of PMEVTYPERn.TC and threshold_count maps to the first bit of TC. These separate attributes have been picked rather than enumerating all the possible combinations of the TC field as in the Arm ARM. The attributes would be used on a Perf command line like this: $ perf stat -e stall_slot/threshold=2,threshold_compare=2/ A new capability for reading out the maximum supported threshold value has also been added: $ cat /sys/bus/event_source/devices/armv8_pmuv3/caps/threshold_max 0x000000ff If a threshold higher than threshold_max is provided, then an error is generated. If FEAT_PMUv3_TH isn't implemented or a 32 bit kernel is running, then threshold_max reads zero, and attempting to set a threshold value will also result in an error. The threshold is per PMU counter, and there are potentially different threshold_max values per PMU type on heterogeneous systems. Bits higher than 32 now need to be written into PMEVTYPER, so armv8pmu_write_evtype() has to be updated to take an unsigned long value rather than u32 which gives the correct behavior on both aarch32 and 64. 
Signed-off-by: James Clark Link: https://lore.kernel.org/r/20231211161331.1277825-11-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 91957b3468e9a..0f4d62ef3a9a1 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -262,6 +262,7 @@ #define ARMV8_PMU_SLOTS GENMASK(7, 0) #define ARMV8_PMU_BUS_SLOTS GENMASK(15, 8) #define ARMV8_PMU_BUS_WIDTH GENMASK(19, 16) +#define ARMV8_PMU_THWIDTH GENMASK(23, 20) /* * This code is really good -- cgit v1.2.3 From 253ca8678d30bcf94410b54476fc1e0f1627a137 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Nov 2023 12:24:38 -0800 Subject: Improve __fget_files_rcu() code generation (and thus __fget_light()) Commit 0ede61d8589c ("file: convert to SLAB_TYPESAFE_BY_RCU") caused a performance regression as reported by the kernel test robot. The __fget_light() function is one of those critical ones for some loads, and the code generation was unnecessarily impacted. Let's just write that function better. 
Reported-by: kernel test robot Cc: Christian Brauner Cc: Jann Horn Cc: Mateusz Guzik Closes: https://lore.kernel.org/oe-lkp/202311201406.2022ca3f-oliver.sang@intel.com Signed-off-by: Linus Torvalds Link: https://lore.kernel.org/r/CAHk-=wiCJtLbFWNURB34b9a_R_unaH3CiMRXfkR0-iihB_z68A@mail.gmail.com Signed-off-by: Christian Brauner --- include/linux/fdtable.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index bc4c3287a65ef..80bd7789bab15 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -83,12 +83,17 @@ struct dentry; static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); - - if (fd < fdt->max_fds) { - fd = array_index_nospec(fd, fdt->max_fds); - return rcu_dereference_raw(fdt->fd[fd]); - } - return NULL; + unsigned long mask = array_index_mask_nospec(fd, fdt->max_fds); + struct file *needs_masking; + + /* + * 'mask' is zero for an out-of-bounds fd, all ones for ok. + * 'fd&mask' is 'fd' for ok, or 0 for out of bounds. + * + * Accessing fdt->fd[0] is ok, but needs masking of the result. + */ + needs_masking = rcu_dereference_raw(fdt->fd[fd&mask]); + return (struct file *)(mask & (unsigned long)needs_masking); } static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd) -- cgit v1.2.3 From a88c955fcfb49727d0ed86b47410f6555a8e69e4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:07 +0100 Subject: file: s/close_fd_get_file()/file_close_fd()/g That really shouldn't have "get" in there as that implies we're bumping the reference count which we don't do at all. We used to but not anymore. Now we're just closing the fd and picking that file from the fdtable without bumping the reference count. Update the wrong documentation while at it. 
Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-1-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fdtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 80bd7789bab15..78c8326d74ae2 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -119,7 +119,7 @@ int iterate_fd(struct files_struct *, unsigned, extern int close_fd(unsigned int fd); extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); -extern struct file *close_fd_get_file(unsigned int fd); +extern struct file *file_close_fd(unsigned int fd); extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, struct files_struct **new_fdp); -- cgit v1.2.3 From 372a34e66fb7f95124fadae9c600b231c35696a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:09 +0100 Subject: fs: replace f_rcuhead with f_task_work The naming is actively misleading since we switched to SLAB_TYPESAFE_BY_RCU. rcu_head is #define callback_head. Use callback_head directly and rename f_rcuhead to f_task_work. Add comments in there to explain what it's used for. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-3-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..354fd02e0e111 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -991,8 +991,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) */ struct file { union { + /* fput() uses task work when closing and freeing file (default). */ + struct callback_head f_task_work; + /* fput() must use workqueue (most kernel threads). 
*/ struct llist_node f_llist; - struct rcu_head f_rcuhead; unsigned int f_iocb_flags; }; -- cgit v1.2.3 From eac9189c96196574a83a553ca5a7543dd9f5fe3e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:10 +0100 Subject: file: stop exposing receive_fd_user() Not every subsystem needs to have their own specialized helper. Just use the __receive_fd() helper. Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-4-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/file.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 6e9099d293436..c0d5219c28528 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -101,13 +101,6 @@ extern int __receive_fd(struct file *file, int __user *ufd, extern int receive_fd(struct file *file, unsigned int o_flags); -static inline int receive_fd_user(struct file *file, int __user *ufd, - unsigned int o_flags) -{ - if (ufd == NULL) - return -EFAULT; - return __receive_fd(file, ufd, o_flags); -} int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); -- cgit v1.2.3 From 4e94ddfe2aab72139acb8d5372fac9e6c3f3e383 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 30 Nov 2023 13:49:11 +0100 Subject: file: remove __receive_fd() Honestly, there's little value in having a helper with and without that int __user *ufd argument. It's just messy and doesn't really give us anything. Just expose receive_fd() with that argument and get rid of that helper. 
Link: https://lore.kernel.org/r/20231130-vfs-files-fixes-v1-5-e73ca6f4ea83@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/file.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index c0d5219c28528..6834a29338c43 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -96,10 +96,7 @@ DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, - unsigned int o_flags); - -extern int receive_fd(struct file *file, unsigned int o_flags); +int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); -- cgit v1.2.3 From b66509b8497f2b002a2654e386a440f1274ddcc7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 1 Dec 2023 00:57:35 +0000 Subject: io_uring: split out cmd api into a separate header linux/io_uring.h is slowly becoming a rubbish bin where we put anything exposed to other subsystems. For instance, the task exit hooks and io_uring cmd infra are completely orthogonal and don't need each other's definitions. Start cleaning it up by splitting out all command bits into a new header file. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ec50bae6e21f371d3850796e716917fc141225a.1701391955.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 89 +----------------------------------------- include/linux/io_uring/cmd.h | 81 ++++++++++++++++++++++++++++++++++++++ include/linux/io_uring_types.h | 20 ++++++++++ 3 files changed, 102 insertions(+), 88 deletions(-) create mode 100644 include/linux/io_uring/cmd.h (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index aefb73eeeebff..d8fc93492dc50 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -6,71 +6,13 @@ #include #include -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* the request is executed from poll, it should not be freed */ - IO_URING_F_MULTISHOT = 4, - /* executed by io-wq */ - IO_URING_F_IOWQ = 8, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, - - /* ctx state flags, for URING_CMD */ - IO_URING_F_SQE128 = (1 << 8), - IO_URING_F_CQE32 = (1 << 9), - IO_URING_F_IOPOLL = (1 << 10), - - /* set when uring wants to cancel a previously issued command */ - IO_URING_F_CANCEL = (1 << 11), - IO_URING_F_COMPAT = (1 << 12), -}; - -/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ -#define IORING_URING_CMD_CANCELABLE (1U << 30) -#define IORING_URING_CMD_POLLED (1U << 31) - -struct io_uring_cmd { - struct file *file; - const struct io_uring_sqe *sqe; - union { - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); - /* used for polled completion */ - void *cookie; - }; - u32 cmd_op; - u32 flags; - u8 pdu[32]; /* available inline for free use */ -}; - -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} - #if defined(CONFIG_IO_URING) -int io_uring_cmd_import_fixed(u64 ubuf, unsigned 
long len, int rw, - struct iov_iter *iter, void *ioucmd); -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, - unsigned issue_flags); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); -void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), - unsigned flags); -/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); - -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); -} +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); static inline void io_uring_files_cancel(void) { @@ -89,28 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); -void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) -{ - return -EOPNOTSUPP; -} -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2, unsigned issue_flags) -{ -} -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} static inline struct sock *io_uring_get_socket(struct file 
*file) { return NULL; @@ -133,14 +54,6 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ -} -static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) -{ - return NULL; -} #endif #endif diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h new file mode 100644 index 0000000000000..62fcfaf6fcc98 --- /dev/null +++ b/include/linux/io_uring/cmd.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_CMD_H +#define _LINUX_IO_URING_CMD_H + +#include +#include + +/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_CANCELABLE (1U << 30) +#define IORING_URING_CMD_POLLED (1U << 31) + +struct io_uring_cmd { + struct file *file; + const struct io_uring_sqe *sqe; + union { + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); + /* used for polled completion */ + void *cookie; + }; + u32 cmd_op; + u32 flags; + u8 pdu[32]; /* available inline for free use */ +}; + +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + +#if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, + unsigned issue_flags); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); +/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + 
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} + +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags); +struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); + +#else +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2, unsigned issue_flags) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} +static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ +} +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return NULL; +} +#endif + +#endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 805bb635cdf55..8c807bcc8b2b0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,26 @@ #include #include +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* the request is executed from poll, it should not be freed */ + IO_URING_F_MULTISHOT = 4, + /* executed by io-wq */ + IO_URING_F_IOWQ = 8, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = (1 << 8), + IO_URING_F_CQE32 = (1 << 9), + IO_URING_F_IOPOLL = (1 << 10), + + /* set when uring wants to cancel a previously issued command */ + IO_URING_F_CANCEL = (1 << 11), + IO_URING_F_COMPAT = (1 << 12), +}; + struct io_wq_work_node { struct io_wq_work_node *next; }; -- cgit v1.2.3 From 6b04a3737057ddfed396c954f9e4be4fe6d53c62 
Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 1 Dec 2023 00:57:36 +0000 Subject: io_uring/cmd: inline io_uring_cmd_do_in_task_lazy Now as we can easily include io_uring_types.h, move IOU_F_TWQ_LAZY_WAKE and inline io_uring_cmd_do_in_task_lazy(). Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/2ec9fb31dd192d1c5cf26d0a2dec5657d88a8e48.1701391955.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 31 ++++++++++++++++--------------- include/linux/io_uring_types.h | 11 +++++++++++ 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 62fcfaf6fcc98..ee9b3bc3a4af8 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -36,15 +36,6 @@ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned), unsigned flags); -/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); - -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); -} void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); @@ -60,12 +51,9 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t ret2, unsigned issue_flags) { } -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +static inline void __io_uring_cmd_do_in_task(struct 
io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, @@ -78,4 +66,17 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd } #endif +/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); +} + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 8c807bcc8b2b0..bebab36abce89 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,17 @@ #include #include +enum { + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. + */ + IOU_F_TWQ_LAZY_WAKE = 1, +}; + enum io_uring_cmd_flags { IO_URING_F_COMPLETE_DEFER = 1, IO_URING_F_UNLOCKED = 2, -- cgit v1.2.3 From 055c15626a45b1ebc9f2f34981e705e1af171236 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 1 Dec 2023 00:57:37 +0000 Subject: io_uring/cmd: inline io_uring_cmd_get_task With io_uring_types.h we see all required definitions to inline io_uring_cmd_get_task(). 
Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/aa8e317f09e651a5f3e72f8c0ad3902084c1f930.1701391955.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index ee9b3bc3a4af8..d69b4038aa3e5 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -39,7 +39,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -60,10 +59,6 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { } -static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) -{ - return NULL; -} #endif /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ @@ -79,4 +74,9 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->task; +} + #endif /* _LINUX_IO_URING_CMD_H */ -- cgit v1.2.3 From 0f292086c22b43202daffc14b585d3b54b9a1206 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:36 +0200 Subject: splice: return type ssize_t from all helpers Not sure why some splice helpers return long, maybe historic reasons. Change them all to return ssize_t to conform to the splice methods and to the rest of the helpers. 
Suggested-by: Christian Brauner Link: https://lore.kernel.org/r/20231208-horchen-helium-d3ec1535ede5@brauner/ Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-2-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/splice.h | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/splice.h b/include/linux/splice.h index 49532d5dda523..068a8e8ffd732 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -68,31 +68,30 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, typedef int (splice_direct_actor)(struct pipe_inode_info *, struct splice_desc *); -extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, - loff_t *, size_t, unsigned int, - splice_actor *); -extern ssize_t __splice_from_pipe(struct pipe_inode_info *, - struct splice_desc *, splice_actor *); -extern ssize_t splice_to_pipe(struct pipe_inode_info *, - struct splice_pipe_desc *); -extern ssize_t add_to_pipe(struct pipe_inode_info *, - struct pipe_buffer *); -long vfs_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor); +ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, + struct splice_desc *sd, splice_actor *actor); +ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd); +ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf); +ssize_t vfs_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); ssize_t splice_direct_to_actor(struct file *file, struct splice_desc *sd, splice_direct_actor *actor); -long do_splice(struct file *in, loff_t *off_in, struct file *out, - loff_t 
*off_out, size_t len, unsigned int flags); -long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); -long splice_file_range(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len); +ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, + loff_t *off_out, size_t len, unsigned int flags); +ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); +ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len); -extern long do_tee(struct file *in, struct file *out, size_t len, - unsigned int flags); -extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags); +ssize_t do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags); +ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags); /* * for dynamic pipe sizing -- cgit v1.2.3 From 705bcfcbde38b9dd4db00cd3deb0b98bddb0dd4a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:37 +0200 Subject: fs: use splice_copy_file_range() inline helper generic_copy_file_range() is just a wrapper around splice_file_range(), which caps the maximum copy length. The only caller of splice_file_range(), namely __ceph_copy_file_range() is already ready to cope with short copy. Move the length capping into splice_file_range() and replace the exported symbol generic_copy_file_range() with a simple inline helper. 
Suggested-by: Christoph Hellwig Link: https://lore.kernel.org/linux-fsdevel/20231204083849.GC32438@lst.de/ Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-3-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 --- include/linux/splice.h | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04422a0eccddd..900d0cd55b50f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2090,9 +2090,6 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); -extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, diff --git a/include/linux/splice.h b/include/linux/splice.h index 068a8e8ffd732..9dec4861d09f6 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -88,6 +88,13 @@ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len); +static inline long splice_copy_file_range(struct file *in, loff_t pos_in, + struct file *out, loff_t pos_out, + size_t len) +{ + return splice_file_range(in, &pos_in, out, &pos_out, len); +} + ssize_t do_tee(struct file *in, struct file *out, size_t len, unsigned int flags); ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, -- cgit v1.2.3 From 36e28c42187c95eb148873ffb059bfdcb8cdb75b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:38 +0200 Subject: 
fsnotify: split fsnotify_perm() into two hooks We would like to make changes to the fsnotify access permission hook - add file range arguments and add the pre modify event. In preparation for these changes, split the fsnotify_perm() hook into fsnotify_open_perm() and fsnotify_file_perm(). This is needed for fanotify "pre content" events. Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-4-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index bcb6609b54b30..926bb4461b9e6 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -100,29 +100,33 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } -/* Simple call site for access decisions */ -static inline int fsnotify_perm(struct file *file, int mask) +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) { - int ret; - __u32 fsnotify_mask = 0; + __u32 fsnotify_mask = FS_ACCESS_PERM; - if (!(mask & (MAY_READ | MAY_OPEN))) + if (!(perm_mask & MAY_READ)) return 0; - if (mask & MAY_OPEN) { - fsnotify_mask = FS_OPEN_PERM; + return fsnotify_file(file, fsnotify_mask); +} - if (file->f_flags & __FMODE_EXEC) { - ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); +/* + * fsnotify_open_perm - permission hook before file open + */ +static inline int fsnotify_open_perm(struct file *file) +{ + int ret; - if (ret) - return ret; - } - } else if (mask & MAY_READ) { - fsnotify_mask = FS_ACCESS_PERM; + if (file->f_flags & __FMODE_EXEC) { + ret = fsnotify_file(file, FS_OPEN_EXEC_PERM); + if (ret) + return ret; } - return fsnotify_file(file, fsnotify_mask); + return 
fsnotify_file(file, FS_OPEN_PERM); } /* -- cgit v1.2.3 From cb383f06686734ef04daf63a4369566800717b7b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:39 +0200 Subject: fsnotify: assert that file_start_write() is not held in permission hooks filesystem may be modified in the context of fanotify permission events (e.g. by HSM service), so assert that sb freeze protection is not held. If the assertion fails, then the following deadlock would be possible: CPU0 CPU1 CPU2 ------------------------------------------------------------------------- file_start_write()#0 ... fsnotify_perm() fanotify_get_response() => (read event and fill file) ... ... freeze_super() ... sb_wait_write() ... vfs_write() file_start_write()#1 This example demonstrates a use case of a hierarchical storage management (HSM) service that uses fanotify permission events to fill the content of a file before access, while a 3rd process starts fsfreeze. This creates a circular dependency: file_start_write()#0 => fanotify_get_response => file_start_write()#1 => sb_wait_write() => file_end_write()#0 Where file_end_write()#0 can never be called and none of the threads can make progress. The assertion is checked for both MAY_READ and MAY_WRITE permission hooks in preparation for a pre-modify permission event. The assertion is not checked for an open permission event, because do_open() takes mnt_want_write() in O_TRUNC case, meaning that it is not safe to write to filesystem in the context of an open permission event. 
Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-5-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 926bb4461b9e6..0a9d6a8a747ac 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -107,6 +107,13 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask) { __u32 fsnotify_mask = FS_ACCESS_PERM; + /* + * filesystem may be modified in the context of permission events + * (e.g. by HSM filling a file on access), so sb freeze protection + * must not be held. + */ + lockdep_assert_once(file_write_not_started(file)); + if (!(perm_mask & MAY_READ)) return 0; -- cgit v1.2.3 From d9e5d31084b024734e64307521414ef0ae1d5333 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 12 Dec 2023 11:44:40 +0200 Subject: fsnotify: optionally pass access range in file permission hooks In preparation for pre-content permission events with file access range, move fsnotify_file_perm() hook out of security_file_permission() and into the callers. Callers that have the access range information call the new hook fsnotify_file_area_perm() with the access range. 
Reviewed-by: Jan Kara Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20231212094440.250945-6-amir73il@gmail.com Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 0a9d6a8a747ac..11e6434b8e714 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -101,9 +101,10 @@ static inline int fsnotify_file(struct file *file, __u32 mask) } /* - * fsnotify_file_perm - permission hook before file access + * fsnotify_file_area_perm - permission hook before access to file range */ -static inline int fsnotify_file_perm(struct file *file, int perm_mask) +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) { __u32 fsnotify_mask = FS_ACCESS_PERM; @@ -120,6 +121,14 @@ static inline int fsnotify_file_perm(struct file *file, int perm_mask) return fsnotify_file(file, fsnotify_mask); } +/* + * fsnotify_file_perm - permission hook before file access + */ +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return fsnotify_file_area_perm(file, perm_mask, NULL, 0); +} + /* * fsnotify_open_perm - permission hook before file open */ -- cgit v1.2.3 From bf857ddd21d0bffc1edafc317e8e2ce0d6d5950c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:20 -0400 Subject: maple_tree: move debug check to __mas_set_range() __mas_set_range() was created to shortcut resetting the maple state and a debug check was added to the caller (the vma iterator) to ensure the internal maple state remains safe to use. Move the debug check from the vma iterator into the maple tree itself so other users do not incorrectly use the advanced maple state modification. 
Fallout from this change include a large amount of debug setup needed to be moved to earlier in the header, and the maple_tree.h radix-tree test code needed to move the inclusion of the header to after the atomic define. None of those changes have functional changes. Link: https://lkml.kernel.org/r/20231101171629.3612299-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 255 +++++++++++++++++++++++---------------------- 1 file changed, 129 insertions(+), 126 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a452dd8a1e5c2..b5d5992578c91 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -557,6 +557,131 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) + +#ifdef CONFIG_DEBUG_MAPLE_TREE +enum mt_dump_format { + mt_dump_dec, + mt_dump_hex, +}; + +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); +void mas_dump(const struct ma_state *mas); +void mas_wr_dump(const struct ma_wr_state *wr_mas); +void mt_validate(struct maple_tree *mt); +void mt_cache_shrink(void); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_BUG_ON(__mas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: 
%u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MAS_WR_BUG_ON(__wrmas, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) + +#define MT_WARN_ON(__tree, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WARN_ON(__mas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_dump(__mas); \ + mt_dump((__mas)->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) + +#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ + int ret = !!(__x); \ + atomic_inc(&maple_tree_tests_run); \ + if (ret) { \ + pr_info("WARN at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mas_wr_dump(__wrmas); \ + mas_dump((__wrmas)->mas); \ + mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + 
} else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ + unlikely(ret); \ +}) +#else +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) +#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) +#define MT_WARN_ON(__tree, __x) WARN_ON(__x) +#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) +#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. @@ -570,6 +695,9 @@ static inline void mas_reset(struct ma_state *mas) static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { + /* Ensure the range starts within the current slot */ + MAS_WARN_ON(mas, mas_is_active(mas) && + (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } @@ -587,8 +715,8 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - __mas_set_range(mas, start, last); mas->node = MAS_START; + __mas_set_range(mas, start, last); } /** @@ -713,129 +841,4 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) - -#ifdef CONFIG_DEBUG_MAPLE_TREE -enum mt_dump_format { - mt_dump_dec, - mt_dump_hex, -}; - -extern atomic_t maple_tree_tests_run; -extern atomic_t maple_tree_tests_passed; - -void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); -void mas_dump(const struct ma_state *mas); -void mas_wr_dump(const struct ma_wr_state *wr_mas); -void mt_validate(struct maple_tree *mt); -void mt_cache_shrink(void); -#define MT_BUG_ON(__tree, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: 
%u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_BUG_ON(__mas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MAS_WR_BUG_ON(__wrmas, __x) do { \ - atomic_inc(&maple_tree_tests_run); \ - if (__x) { \ - pr_info("BUG at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ -} while (0) - -#define MT_WARN_ON(__tree, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mt_dump(__tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) - -#define MAS_WARN_ON(__mas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_dump(__mas); \ - mt_dump((__mas)->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ 
-}) - -#define MAS_WR_WARN_ON(__wrmas, __x) ({ \ - int ret = !!(__x); \ - atomic_inc(&maple_tree_tests_run); \ - if (ret) { \ - pr_info("WARN at %s:%d (%u)\n", \ - __func__, __LINE__, __x); \ - mas_wr_dump(__wrmas); \ - mas_dump((__wrmas)->mas); \ - mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ - pr_info("Pass: %u Run:%u\n", \ - atomic_read(&maple_tree_tests_passed), \ - atomic_read(&maple_tree_tests_run)); \ - dump_stack(); \ - } else { \ - atomic_inc(&maple_tree_tests_passed); \ - } \ - unlikely(ret); \ -}) -#else -#define MT_BUG_ON(__tree, __x) BUG_ON(__x) -#define MAS_BUG_ON(__mas, __x) BUG_ON(__x) -#define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) -#define MT_WARN_ON(__tree, __x) WARN_ON(__x) -#define MAS_WARN_ON(__mas, __x) WARN_ON(__x) -#define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) -#endif /* CONFIG_DEBUG_MAPLE_TREE */ - #endif /*_LINUX_MAPLE_TREE_H */ -- cgit v1.2.3 From 31c532a8af57513228c2b12d281104198ff412b8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:21 -0400 Subject: maple_tree: add end of node tracking to the maple state Analysis of the mas_for_each() iteration showed that there is a significant time spent finding the end of a node. This time can be greatly reduced if the end of the node is cached in the maple state. Care must be taken to update & invalidate as necessary. Link: https://lkml.kernel.org/r/20231101171629.3612299-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b5d5992578c91..0b82efe0cf1ea 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -393,6 +393,7 @@ struct ma_state { unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; + unsigned char end; /* The end of the node */ }; struct ma_wr_state { -- cgit v1.2.3 From 067311d33e650adfe7ae23765959ddcc1ba18510 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:25 -0400 Subject: maple_tree: separate ma_state node from status The maple tree node is overloaded to keep status as well as the active node. This, unfortunately, results in a re-walk on underflow or overflow. Since the maple state has room, the status can be placed in its own enum in the structure. Once an underflow/overflow is detected, certain modes can restore the status to active and others may need to re-walk just that one node to see the entry. The status being an enum has the benefit of detecting unhandled status in switch statements. [Liam.Howlett@oracle.com: fix comments about MAS_*] Link: https://lkml.kernel.org/r/20231106154124.614247-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: update forking to separate maple state and node] Link: https://lkml.kernel.org/r/20231106154551.615042-1-Liam.Howlett@oracle.com [Liam.Howlett@oracle.com: fix mas_prev() state separation code] Link: https://lkml.kernel.org/r/20231207193319.4025462-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20231101171629.3612299-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 87 ++++++++++++++++++++++++++-------------------- include/linux/mm_types.h | 3 +- 2 files changed, 52 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0b82efe0cf1ea..4dd668f7b111b 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -349,6 +349,36 @@ static inline bool mtree_empty(const struct maple_tree *mt) /* Advanced API */ +/* + * Maple State Status + * ma_active means the maple state is pointing to a node and offset and can + * continue operating on the tree. + * ma_start means we have not searched the tree. + * ma_root means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * ma_none means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * ma_pause means the data within the maple state may be stale, restart the + * operation + * ma_overflow means the search has reached the upper limit of the search + * ma_underflow means the search has reached the lower limit of the search + * ma_error means there was an error, check the node for the error number. 
+ */ +enum maple_status { + ma_active, + ma_start, + ma_root, + ma_none, + ma_pause, + ma_overflow, + ma_underflow, + ma_error, +}; + /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the @@ -381,6 +411,13 @@ static inline bool mtree_empty(const struct maple_tree *mt) * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. + * + * The status of the state is used to determine how the next action should treat + * the state. For instance, if the status is ma_start then the next action + * should start at the root of the tree and walk down. If the status is + * ma_pause then the node may be stale data and should be discarded. If the + * status is ma_overflow, then the last action hit the upper limit. + * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ @@ -390,6 +427,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ + enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; @@ -416,28 +454,12 @@ struct ma_wr_state { spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) - /* * Special values for ma_state.node. - * MAS_START means we have not searched the tree. - * MAS_ROOT means we have searched the tree and the entry we found lives in - * the root of the tree (ie it has index 0, length 1 and is the only entry in - * the tree). - * MAS_NONE means we have searched the tree and there is no node in the - * tree for this entry. 
For example, we searched for index 1 in an empty - * tree. Or we have a tree which points to a full leaf node and we - * searched for an entry which is larger than can be contained in that - * leaf node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ -#define MAS_START ((struct maple_enode *)1UL) -#define MAS_ROOT ((struct maple_enode *)5UL) -#define MAS_NONE ((struct maple_enode *)9UL) -#define MAS_PAUSE ((struct maple_enode *)17UL) -#define MAS_OVERFLOW ((struct maple_enode *)33UL) -#define MAS_UNDERFLOW ((struct maple_enode *)65UL) #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) @@ -446,7 +468,8 @@ struct ma_wr_state { .tree = mt, \ .index = first, \ .last = end, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ @@ -477,7 +500,6 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); -bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); @@ -506,28 +528,18 @@ static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } -/* Checks if a mas has not found anything */ -static inline bool mas_is_none(const struct ma_state *mas) -{ - return mas->node == MAS_NONE; -} - -/* Checks if a mas has been paused */ -static inline bool mas_is_paused(const struct ma_state *mas) +static inline bool mas_is_active(struct ma_state *mas) { - return mas->node == MAS_PAUSE; + return mas->status == ma_active; } -/* 
Check if the mas is pointing to a node or not */ -static inline bool mas_is_active(struct ma_state *mas) +static inline bool mas_is_err(struct ma_state *mas) { - if ((unsigned long)mas->node >= MAPLE_RESERVED_RANGE) - return true; - - return false; + return mas->status == ma_error; } /** @@ -540,9 +552,10 @@ static inline bool mas_is_active(struct ma_state *mas) * * Context: Any context. */ -static inline void mas_reset(struct ma_state *mas) +static __always_inline void mas_reset(struct ma_state *mas) { - mas->node = MAS_START; + mas->status = ma_start; + mas->node = NULL; } /** @@ -716,7 +729,7 @@ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->node = MAS_START; + mas_reset(mas); __mas_set_range(mas, start, last); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ef18d2b253788..a66534c78c4dd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1071,7 +1071,8 @@ struct vma_iterator { .mas = { \ .tree = &(__mm)->mm_mt, \ .index = __addr, \ - .node = MAS_START, \ + .node = NULL, \ + .status = ma_start, \ }, \ } -- cgit v1.2.3 From 0de56e38b307b0cb2ac825e8e7cb371a28daf844 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 1 Nov 2023 13:16:27 -0400 Subject: maple_tree: use maple state end for write operations ma_wr_state was previously tracking the end of the node for writing. Since the implementation of the ma_state end tracking, this is duplicated work. This patch removes the maple write state tracking of the end of the node and uses the maple state end instead. Link: https://lkml.kernel.org/r/20231101171629.3612299-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4dd668f7b111b..b3d63123b945b 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -441,7 +441,6 @@ struct ma_wr_state { unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ - unsigned char node_end; /* mas->node end */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ -- cgit v1.2.3 From 0a97c01cd20bb96359d8c9dedad92a061ed34e0b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:18 -0800 Subject: list_lru: allow explicit memcg and NUMA node selection Patch series "workload-specific and memory pressure-driven zswap writeback", v8. There are currently several issues with zswap writeback: 1. There is only a single global LRU for zswap, making it impossible to perform workload-specific shrinking - an memcg under memory pressure cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u But this solution leaves a lot to be desired, as we still do not have an avenue for an memcg to free up its own memory locked up in the zswap pool. 2. We only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. 
It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch series solves these issues by separating the global zswap LRU into per-memcg and per-NUMA LRUs, and performs workload-specific (i.e. memcg- and NUMA-aware) zswap writeback under memory pressure. The new shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds. This patch (of 6): The interface of list_lru is based on the assumption that the list node and the data it represents are part of the same allocation, made on the correct node/memcg. While this assumption is valid for existing slab objects LRU such as dentries and inodes, it is undocumented, and rather inflexible for certain potential list_lru users (such as the upcoming zswap shrinker and the THP shrinker). It has caused us a lot of issues during our development. This patch changes the list_lru interface so that the caller must explicitly specify NUMA node and memcg when adding and removing objects. The old list_lru_add() and list_lru_del() are renamed to list_lru_add_obj() and list_lru_del_obj(), respectively. It also extends the list_lru API with a new function, list_lru_putback, which undoes a previous list_lru_isolate call. Unlike list_lru_add, it does not increment the LRU node count (as list_lru_isolate does not decrement the node count). list_lru_putback also allows for explicit memcg and NUMA node selection. 
Link: https://lkml.kernel.org/r/20231130194023.4102148-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 54 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index db86ad78d428a..7675a48a07010 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -75,6 +75,8 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * list_lru_add: add an element to the lru list's tail * @lru: the lru pointer * @item: the item to be added. + * @nid: the node id of the sublist to add the item to. + * @memcg: the cgroup of the sublist to add the item to. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or @@ -87,12 +89,28 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * * Return: true if the list was updated, false otherwise */ -bool list_lru_add(struct list_lru *lru, struct list_head *item); +bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); /** - * list_lru_del: delete an element to the lru list + * list_lru_add_obj: add an element to the lru list's tail + * @lru: the lru pointer + * @item: the item to be added. + * + * This function is similar to list_lru_add(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. 
This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise + */ +bool list_lru_add_obj(struct list_lru *lru, struct list_head *item); + +/** + * list_lru_del: delete an element from the lru list * @lru: the lru pointer * @item: the item to be deleted. + * @nid: the node id of the sublist to delete the item from. + * @memcg: the cgroup of the sublist to delete the item from. * * This function works analogously as list_lru_add() in terms of list * manipulation. The comments about an element already pertaining to @@ -100,7 +118,21 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); * * Return: true if the list was updated, false otherwise */ -bool list_lru_del(struct list_lru *lru, struct list_head *item); +bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); + +/** + * list_lru_del_obj: delete an element from the lru list + * @lru: the lru pointer + * @item: the item to be deleted. + * + * This function is similar to list_lru_del(), but the NUMA node and the + * memcg of the sublist is determined by @item list_head. This assumption is + * valid for slab objects LRU such as dentries, inodes, etc. + * + * Return value: true if the list was updated, false otherwise. + */ +bool list_lru_del_obj(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru @@ -138,6 +170,22 @@ static inline unsigned long list_lru_count(struct list_lru *lru) void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); +/** + * list_lru_putback: undo list_lru_isolate + * @lru: the lru pointer. + * @item: the item to put back. + * @nid: the node id of the sublist to put the item back to. + * @memcg: the cgroup of the sublist to put the item back to. 
+ * + * Put back an isolated item into its original LRU. Note that unlike + * list_lru_add, this does not increment the node LRU count (as + * list_lru_isolate does not originally decrement this count). + * + * Since we might have dropped the LRU lock in between, recompute list_lru_one + * from the node's id and memcg. + */ +void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, + struct mem_cgroup *memcg); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); -- cgit v1.2.3 From fdc4161ff6a5e96222e159c1f1b28d31a985130d Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:19 -0800 Subject: memcontrol: implement mem_cgroup_tryget_online() This patch implements a helper function that tries to get a reference to an memcg's css and also checks that it is online. This new function is almost exactly the same as the existing mem_cgroup_tryget(), except for the onlineness check. In the !CONFIG_MEMCG case, it always returns true, analogous to mem_cgroup_tryget(). This is useful, e.g., for the new zswap writeback scheme, where we need to select the next online memcg as a candidate for the global limit reclaim. 
Link: https://lkml.kernel.org/r/20231130194023.4102148-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7bdcf3020d7a3..2bd7d14ace78a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -821,6 +821,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return !memcg || css_tryget(&memcg->css); } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return !memcg || css_tryget_online(&memcg->css); +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) @@ -1349,6 +1354,11 @@ static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg) return true; } +static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg) +{ + return true; +} + static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } -- cgit v1.2.3 From a65b0e7607ccb5e5184591f73e48512f25c76061 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:20 -0800 Subject: zswap: make shrinking memcg-aware Currently, we only have a single global LRU for zswap. This makes it impossible to perform workload-specific shrinking - an memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs.
This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking: https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modify the reclaim logic: a) When a store attempt hits an memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap. b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which an memcg is selected for reclaim in a round-robin-like fashion. [nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()] Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com [nphamcs@gmail.com: drop the pool's reference at the end of the writeback step] Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com Signed-off-by: Domenico Cerasuolo Co-developed-by: Nhat Pham Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++++ include/linux/zswap.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2bd7d14ace78a..a308c8eacf20d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) +{ + return NULL; +} + static inline bool folio_memcg_kmem(struct folio *folio) { return false; 
diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde1..e571e393669bb 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); +void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); #else @@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio) static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} +static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} #endif -- cgit v1.2.3 From 7108cc3f765cafd48a6a35f8add140beaecfa75b Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Thu, 30 Nov 2023 11:40:21 -0800 Subject: mm: memcg: add per-memcg zswap writeback stat Since zswap now writes back pages from memcg-specific LRUs, we now need a new stat to show writebacks count for each memcg. 
[nphamcs@gmail.com: rename ZSWP_WB to ZSWPWB] Link: https://lkml.kernel.org/r/20231205193307.2432803-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-5-nphamcs@gmail.com Suggested-by: Nhat Pham Signed-off-by: Domenico Cerasuolo Signed-off-by: Nhat Pham Tested-by: Bagas Sanjaya Reviewed-by: Yosry Ahmed Cc: Chris Li Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index d1b847502f09c..747943bc8cc2d 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -142,6 +142,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_ZSWAP ZSWPIN, ZSWPOUT, + ZSWPWB, #endif #ifdef CONFIG_X86 DIRECT_MAP_LEVEL2_SPLIT, -- cgit v1.2.3 From b5ba474f3f518701249598b35c581b92a3c95b48 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 30 Nov 2023 11:40:23 -0800 Subject: zswap: shrink zswap pool based on memory pressure Currently, we only shrink the zswap pool when the user-defined limit is hit. This means that if we set the limit too high, cold data that are unlikely to be used again will reside in the pool, wasting precious memory. It is hard to predict how much zswap space will be needed ahead of time, as this depends on the workload (specifically, on factors such as memory access patterns and compressibility of the memory pages). This patch implements a memcg- and NUMA-aware shrinker for zswap, that is initiated when there is memory pressure. The shrinker does not have any parameter that must be tuned by the user, and can be opted in or out on a per-memcg basis. 
Furthermore, to make it more robust for many workloads and prevent overshrinking (i.e. evicting warm pages that might be refaulted into memory), we build in the following heuristics: * Estimate the number of warm pages residing in zswap, and attempt to protect this region of the zswap LRU. * Scale the number of freeable objects by an estimate of the memory saving factor. The better zswap compresses the data, the fewer pages we will evict to swap (as we will otherwise incur IO for relatively small memory saving). * During reclaim, if the shrinker encounters a page that is also being brought into memory, the shrinker will cautiously terminate its shrinking action, as this is a sign that it is touching the warmer region of the zswap LRU. As a proof of concept, we ran the following synthetic benchmark: build the linux kernel in a memory-limited cgroup, and allocate some cold data in tmpfs to see if the shrinker could write them out and improve the overall performance. Depending on the amount of cold data generated, we observe from 14% to 35% reduction in kernel CPU time used in the kernel builds.
[nphamcs@gmail.com: check shrinker enablement early, use less costly stat flushing] Link: https://lkml.kernel.org/r/20231206194456.3234203-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231130194023.4102148-7-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Tested-by: Bagas Sanjaya Cc: Chris Li Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Vitaly Wool Cc: Yosry Ahmed Cc: Chengming Zhou Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 ++ include/linux/zswap.h | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 14faffa4354f5..9ef9d010bff02 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -641,6 +642,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e571e393669bb..08c240e16a01f 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,20 +5,40 @@ #include #include +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* + * Number of pages in zswap that should be protected from the shrinker. + * This number is an estimate of the following counts: + * + * a) Recent page faults. + * b) Recent insertion to the zswap LRU. This includes new zswap stores, + * as well as recent zswap LRU rotations. + * + * These pages are likely to be warm, and might incur IO if they are written + * to swap.
+ */ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_page_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -33,7 +53,8 @@ static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} - +static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} +static inline void zswap_page_swapin(struct page *page) {} #endif #endif /* _LINUX_ZSWAP_H */ -- cgit v1.2.3 From 9294a037c01564786abb15436529fae3863268a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 30 Nov 2023 02:36:44 +0000 Subject: mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning Patch series "mm/damon: let users feed and tame/auto-tune DAMOS". Introduce Aim-oriented Feedback-driven DAMOS Aggressiveness Auto-tuning. It makes DAMOS self-tuned with periodic simple user feedback. Background: DAMOS Control Difficulty ==================================== DAMOS helps users easily implement access pattern aware system operations. However, controlling DAMOS in the wild is not that easy. The basic way for DAMOS control is specifying the target access pattern. In this approach, the user is assumed to well understand the access pattern and the characteristics of the system and the workloads. Though there are useful tools for that, it takes time and effort depending on the complexity and the dynamicity of the system and the workloads. 
After all, the access pattern consists of three ranges, namely the size, the access rate, and the age of the regions. It means users need to tune six parameters, which is anyway not a simple task. One of the worst cases would be DAMOS being too aggressive like a berserker, and therefore consuming too much system resource and making unwanted radical system operations. To let users avoid such cases, DAMOS allows users to set the upper-limit of the schemes' aggressiveness, namely DAMOS quota. DAMOS further provides its best-effort under the limit by prioritizing regions based on the access pattern of the regions. For example, users can ask DAMOS to page out up to 100 MiB of memory regions per second. Then DAMOS pages out regions that are not accessed for a longer time (colder) first under the limit. This allows users to set the target access pattern a bit naive with wider ranges, and focus on tuning only one parameter, the quota. In other words, the number of parameters to tune can be reduced from six to one. Still, however, the optimum value for the quota depends on the system and the workloads' characteristics, so not that simple. The number of parameters to tune can also increase again if the user needs to run multiple schemes. Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning ============================================================= Users would use DAMOS since they want to achieve something with it. They will likely have measurable metrics representing the achievement and the target number of the metric like SLO, and continuously measure that anyway. While the additional cost of getting the information is nearly zero, it could be useful for DAMOS to understand how appropriate its current aggressiveness is set, and adjust it on its own to make the metric value more close to the target. Based on this idea, we introduce a new way of tuning DAMOS with nearly zero additional effort, namely Aim-oriented Feedback-driven DAMOS Aggressiveness Auto Tuning. 
It asks users to provide feedback representing how well DAMOS is doing relative to the users' aim. Then DAMOS adjusts its aggressiveness, specifically the quota that provides the best effort result under the limit, based on the current level of the aggressiveness and the users' feedback. Implementation ============== The implementation asks users to represent the feedback with score numbers. The scores could be anything including user-space specific metrics including latency and throughput of special user-space workloads, and system metrics including free memory ratio, memory pressure stall time (PSI), and active to inactive LRU lists size ratio. The feedback scores and the aggressiveness of the given DAMOS scheme are assumed to be positively proportional, though. Selecting metrics that satisfy the assumption is the users' responsibility. The core logic uses the below simple feedback loop algorithm to calculate the next aggressiveness level of the scheme from the current aggressiveness level and the current feedback (target_score and current_score). It calculates the compensation for next aggressiveness as a proportion of current aggressiveness and distance to the target score. As a result, it arrives at the near-goal state in a short time using big steps when it's far from the goal, but avoids making unnecessarily radical changes that could turn out to be a bad decision using small steps when it's near the goal. f(n) = max(1, f(n - 1) * ((target_score - current_score) / target_score + 1)) Note that the compensation value becomes negative when it's over achieving the goal. That's why the feedback metric and the aggressiveness of the scheme should be positively proportional. The distance-adaptive speed manipulation is simply applied.
Example Use Cases ================= If users want to reduce the memory footprint of the system as much as possible as long as the time spent for handling the resulting memory pressure is within a threshold, they could use DAMOS scheme that reclaims cold memory regions aiming for a little level of memory pressure stall time. If users want the active/inactive LRU lists well balanced to reduce the performance impact due to possible future memory pressure, they could use two schemes. The first one would be set to locate hot pages in the active LRU list, aiming for a specific active-to-inactive LRU list size ratio, say, 70%. The second one would be to locate cold pages in the inactive LRU list, aiming for a specific inactive-to-active LRU list size ratio, say, 30%. Then, DAMOS will balance the two schemes based on the goal and feedback. This aim-oriented auto tuning could also be useful for general balancing-required access aware system operations such as system memory auto scaling[3] and tiered memory management[4]. These two example usages are not what current DAMOS implementation is already supporting, but require additional DAMOS action developments, though. Evaluation: subtle memory pressure aiming proactive reclamation =============================================================== To show if the implementation works as expected, we prepare four different system configurations on AWS i3.metal instances. The first setup (original) runs the workload without any DAMOS scheme. The second setup (not-tuned) runs the workload with a virtual address space-based proactive reclamation scheme that pages out memory regions that are not accessed for five seconds or more. The third setup (offline-tuned) runs the same proactive reclamation DAMOS scheme, but after making it tuned for each workload offline, using our previous user-space driven automatic tuning approach, namely DAMOOS[1]. 
The fourth and final setup (AFDAA) runs the scheme that is the same as that of 'not-tuned' setup, but aims to keep 0.5% of 'some' memory pressure stall time (PSI) for the last 10 seconds using the aiming-oriented auto tuning. For each setup, we run realistic workloads from PARSEC3 and SPLASH-2X benchmark suites. For each run, we measure RSS and runtime of the workload, and 'some' memory pressure stall time (PSI) of the system. We repeat the runs five times and use averaged measurements. For simple comparison of the results, we normalize the measurements to those of 'original'. In the case of the PSI, though, the measurement for 'original' was zero, so we normalize the value to that of 'not-tuned' scheme's result. The normalized results are shown below. Not-tuned Offline-tuned AFDAA RSS 0.622688178226118 0.787950678944904 0.740093483278979 runtime 1.11767826657912 1.0564674983585 1.0910833880499 PSI 1 0.727521443794069 0.308498846350299 The 'not-tuned' scheme achieves about 38.7% memory saving but incurs about 11.7% runtime slowdown. The 'offline-tuned' scheme achieves about 22.2% memory saving with about 5.5% runtime slowdown. It also achieves about 28.2% memory pressure stall time saving. AFDAA achieves about 26% memory saving with about 9.1% runtime slowdown. It also achieves about 69.1% memory pressure stall time saving. We repeat this test multiple times, and get consistent results. AFDAA is now integrated in our daily DAMON performance test setup. Apparently the aggressiveness of 'AFDAA' setup is somewhere between those of 'not-tuned' and 'offline-tuned' setup, since its memory saving and runtime overhead are between those of the other two setups. Actually we set the memory pressure stall time goal aiming for this middle aggressiveness. The difference in the two metrics is not significant, though. However, it shows significant saving of the memory pressure stall time, which was the goal of the auto-tuning, over the two variants.
Hence, we conclude the automatic tuning is working as expected. Please note that the AFDAA setup is only for the evaluation, and therefore intentionally set a bit aggressive. It might not be appropriate for production environments. The test code is also available[2], so you could reproduce it on your system and workloads. Patches Sequence ================ The first four patches implement the core logic and user interfaces for the auto tuning. The first patch implements the core logic for the auto tuning, and the API for DAMOS users in the kernel space. The second patch implements basic file operations of DAMON sysfs directories and files that will be used for setting the goals and providing the feedback. The third patch connects the quota goals files inputs to the DAMOS core logic. Finally the fourth patch implements a dedicated DAMOS sysfs command for efficiently committing the quota goals feedback. Two patches for simple tests of the logic and interfaces follow. The fifth patch implements the core logic unit test. The sixth patch implements a selftest for the DAMON Sysfs interface for the goals. Finally, three patches for documentation follow. The seventh patch documents the design of the feature. The eighth patch updates the API doc for the new sysfs files. The final ninth patch updates the usage document for the features. References ========== [1] DAOS paper: https://www.amazon.science/publications/daos-data-access-aware-operating-system [2] Evaluation code: https://github.com/damonitor/damon-tests/commit/3f884e61193f0166b8724554b6d06b0c449a712d [3] Memory auto scaling RFC idea: https://lore.kernel.org/damon/20231112195114.61474-1-sj@kernel.org/ [4] DAMON-based tiered memory management RFC idea: https://lore.kernel.org/damon/20231112195602.61525-1-sj@kernel.org/ This patch (of 9) Users can effectively control the upper-limit aggressiveness of DAMOS schemes using the quota feature.
The quota provides best result under the limit by prioritizing regions based on the access pattern. That said, finding the best value, which could depend on dynamic characteristics of the system and the workloads, is still challenging. Implement a simple feedback-driven tuning mechanism and use it for automatic tuning of DAMOS quota. The implementation allows users to provide the feedback by setting a feedback score returning callback function. Then DAMOS periodically calls the function back and adjusts the quota based on the return value of the callback and current quota value. Note that the absolute-value based time/size quotas still work as the maximum hard limits of the scheme's aggressiveness. The feedback-driven auto-tuned quota is applied only if it is not exceeding the manually set maximum limits. Same for the scheme-target access pattern and filters like other features. [sj@kernel.org: document get_score_arg field of struct damos_quota] Link: https://lkml.kernel.org/r/20231204170106.60992-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231130023652.50284-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231130023652.50284-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ab2f17d9926b5..aa34ab433bc5c 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -136,6 +136,9 @@ enum damos_action { * @weight_nr_accesses: Weight of the region's nr_accesses for prioritization. * @weight_age: Weight of the region's age for prioritization. * + * @get_score: Feedback function for self-tuning quota. 
+ * @get_score_arg: Parameter for @get_score + * * To avoid consuming too much CPU time or IO resources for applying the * &struct damos->action to large memory, DAMON allows users to set time and/or * size quotas. The quotas can be set by writing non-zero values to &ms and @@ -153,6 +156,17 @@ enum damos_action { * You could customize the prioritization logic by setting &weight_sz, * &weight_nr_accesses, and &weight_age, because monitoring operations are * encouraged to respect those. + * + * If @get_score function pointer is set, DAMON calls it back with + * @get_score_arg and gets the return value of it for every @reset_interval. + * Then, DAMON adjusts the effective quota using the return value as a feedback + * score to the current quota, using its internal feedback loop algorithm. + * + * The feedback loop algorithm assumes the quota input and the feedback score + * output are in a positive proportional relationship, and the goal of the + * tuning is getting the feedback score value of 10,000. If @ms and/or @sz are + * set together, those work as a hard limit quota. If neither @ms nor @sz are + * set, the mechanism starts from the quota of one byte. */ struct damos_quota { unsigned long ms; @@ -163,6 +177,9 @@ struct damos_quota { unsigned int weight_nr_accesses; unsigned int weight_age; + unsigned long (*get_score)(void *arg); + void *get_score_arg; + /* private: */ /* For throughput estimation */ unsigned long total_charged_sz; @@ -179,6 +196,9 @@ struct damos_quota { /* For prioritization */ unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; + + /* For feedback loop */ + unsigned long esz_bp; }; /** -- cgit v1.2.3 From 1486fb50136f4799946f5ecfe050094574647153 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 18 Nov 2023 10:32:28 +0800 Subject: mm: ksm: use more folio api in ksm_might_need_to_copy() Patch series "mm: cleanup and use more folio in page fault", v3.
Rename page_copy_prealloc() to folio_prealloc(), which is used by more functions, also do more folio conversion in page fault. This patch (of 5): Since ksm only supports normal pages, and there is no swapout/in for ksm large folios either, add large folio check in ksm_might_need_to_copy(), also convert page->index to folio->index as page->index is going away. Then convert ksm_might_need_to_copy() to use more folio api to save nine compound_head() calls, shorten 'address' to 'addr' to reduce max-line-length. Link: https://lkml.kernel.org/r/20231118023232.1409103-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20231118023232.1409103-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c2dd786a30e1f..4643d5244e77c 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -77,7 +77,7 @@ static inline void ksm_exit(struct mm_struct *mm) * but what if the vma was unmerged while the page was swapped out?
*/ struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address); + struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); @@ -130,7 +130,7 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, } static inline struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long addr) { return page; } -- cgit v1.2.3 From f67f8d4a8c1e1ebc85a6cbdb9a7266f14863461c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 1 Dec 2023 14:59:36 -0500 Subject: mm/rmap: fix misplaced parenthesis of a likely() Running my yearly branch profiler to see where likely/unlikely annotation may be added or removed, I discovered this: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 457918 100 page_try_dup_anon_rmap rmap.h 264 [..] 458021 0 0 page_try_dup_anon_rmap rmap.h 265 I thought it was interesting that line 264 of rmap.h had a 100% incorrect annotation, but the line directly below it was 100% correct. Looking at the code: if (likely(!is_device_private_page(page) && unlikely(page_needs_cow_for_dma(vma, page)))) It didn't make sense. The "likely()" was around the entire if statement (not just the "!is_device_private_page(page)"), which also included the "unlikely()" portion of that if condition. If the unlikely portion is unlikely to be true, that would make the entire if condition unlikely to be true, so it made no sense at all to say the entire if condition is true. What is more likely to be likely is just the first part of the if statement before the && operation. It's likely to be a misplaced parenthesis. And after making the if condition broken into a likely() && unlikely(), both now appear to be correct! 
Link: https://lkml.kernel.org/r/20231201145936.5ddfdb50@gandalf.local.home Fixes: fb3d824d1a46c ("mm/rmap: split page_dup_rmap() into page_dup_file_rmap() and page_try_dup_anon_rmap()") Signed-off-by: Steven Rostedt (Google) Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b26fe858fd444..3c2fc291b071d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -261,8 +261,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * guarantee the pinned page won't be randomly replaced in the * future on write faults. */ - if (likely(!is_device_private_page(page) && - unlikely(page_needs_cow_for_dma(vma, page)))) + if (likely(!is_device_private_page(page)) && + unlikely(page_needs_cow_for_dma(vma, page))) return -EBUSY; ClearPageAnonExclusive(page); -- cgit v1.2.3 From 82c944d05b1a24c76948ee9d6bb1d7de1ebb8b3a Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 28 Nov 2023 14:25:30 +0100 Subject: net: wan: Add framer framework support A framer is a component in charge of an E1/T1 line interface. Connected usually to a TDM bus, it converts TDM frames to/from E1/T1 frames. It also provides information related to the E1/T1 line. The framer framework provides a set of APIs for the framer drivers (framer provider) to create/destroy a framer and APIs for the framer users (framer consumer) to obtain a reference to the framer, and use the framer.
This basic implementation provides a framer abstraction for: - power on/off the framer - get the framer status (line state) - be notified on framer status changes - get/set the framer configuration Signed-off-by: Herve Codina Reviewed-by: Christophe Leroy Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231128132534.258459-2-herve.codina@bootlin.com Signed-off-by: Linus Walleij --- include/linux/framer/framer-provider.h | 194 +++++++++++++++++++++++++++++++ include/linux/framer/framer.h | 205 +++++++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 include/linux/framer/framer-provider.h create mode 100644 include/linux/framer/framer.h (limited to 'include/linux') diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h new file mode 100644 index 0000000000000..782cd5fc83d54 --- /dev/null +++ b/include/linux/framer/framer-provider.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Generic framer provider header file + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina + */ + +#ifndef __DRIVERS_PROVIDER_FRAMER_H +#define __DRIVERS_PROVIDER_FRAMER_H + +#include +#include +#include + +#define FRAMER_FLAG_POLL_STATUS BIT(0) + +/** + * struct framer_ops - set of function pointers for performing framer operations + * @init: operation to be performed for initializing the framer + * @exit: operation to be performed while exiting + * @power_on: powering on the framer + * @power_off: powering off the framer + * @flags: OR-ed flags (FRAMER_FLAG_*) to ask for core functionality + * - @FRAMER_FLAG_POLL_STATUS: + * Ask the core to perform a polling to get the framer status and + * notify consumers on change. + * The framer should call @framer_notify_status_change() when it + * detects a status change. This is usually done using interrupts. + * If the framer cannot detect this change, it can ask the core for + * a status polling.
The core will call @get_status() periodically + * and, on change detected, it will notify the consumer. + * the @get_status() + * @owner: the module owner containing the ops + */ +struct framer_ops { + int (*init)(struct framer *framer); + void (*exit)(struct framer *framer); + int (*power_on)(struct framer *framer); + int (*power_off)(struct framer *framer); + + /** + * @get_status: + * + * Optional. + * + * Used to get the framer status. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, a negative error code otherwise + */ + int (*get_status)(struct framer *framer, struct framer_status *status); + + /** + * @set_config: + * + * Optional. + * + * Used to set the framer configuration. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, a negative error code otherwise + */ + int (*set_config)(struct framer *framer, const struct framer_config *config); + + /** + * @get_config: + * + * Optional. + * + * Used to get the framer configuration. framer_init() must have + * been called on the framer.
+ * + * Returns: 0 if successful, an negative error code otherwise + */ + int (*get_config)(struct framer *framer, struct framer_config *config); + + u32 flags; + struct module *owner; +}; + +/** + * struct framer_provider - represents the framer provider + * @dev: framer provider device + * @children: can be used to override the default (dev->of_node) child node + * @owner: the module owner having of_xlate + * @list: to maintain a linked list of framer providers + * @of_xlate: function pointer to obtain framer instance from framer pointer + */ +struct framer_provider { + struct device *dev; + struct module *owner; + struct list_head list; + struct framer * (*of_xlate)(struct device *dev, + struct of_phandle_args *args); +}; + +static inline void framer_set_drvdata(struct framer *framer, void *data) +{ + dev_set_drvdata(&framer->dev, data); +} + +static inline void *framer_get_drvdata(struct framer *framer) +{ + return dev_get_drvdata(&framer->dev); +} + +#if IS_ENABLED(CONFIG_GENERIC_FRAMER) + +/* Create and destroy a framer */ +struct framer *framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops); +void framer_destroy(struct framer *framer); + +/* devm version */ +struct framer *devm_framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops); + +struct framer *framer_provider_simple_of_xlate(struct device *dev, + struct of_phandle_args *args); + +struct framer_provider * +__framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)); + +void framer_provider_of_unregister(struct framer_provider *framer_provider); + +struct framer_provider * +__devm_framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)); + +void framer_notify_status_change(struct framer *framer); + +#else /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */ 
+ +static inline struct framer *framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void framer_destroy(struct framer *framer) +{ +} + +/* devm version */ +static inline struct framer *devm_framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer *framer_provider_simple_of_xlate(struct device *dev, + struct of_phandle_args *args) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer_provider * +__framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)) +{ + return ERR_PTR(-ENOSYS); +} + +void framer_provider_of_unregister(struct framer_provider *framer_provider) +{ +} + +static inline struct framer_provider * +__devm_framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)) +{ + return ERR_PTR(-ENOSYS); +} + +void framer_notify_status_change(struct framer *framer) +{ +} + +#endif /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */ + +#define framer_provider_of_register(dev, xlate) \ + __framer_provider_of_register((dev), THIS_MODULE, (xlate)) + +#define devm_framer_provider_of_register(dev, xlate) \ + __devm_framer_provider_of_register((dev), THIS_MODULE, (xlate)) + +#endif /* __DRIVERS_PROVIDER_FRAMER_H */ diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h new file mode 100644 index 0000000000000..9a9b88962c296 --- /dev/null +++ b/include/linux/framer/framer.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Generic framer header file + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina + */ + +#ifndef __DRIVERS_FRAMER_H +#define __DRIVERS_FRAMER_H + +#include +#include +#include +#include +#include +#include + +/** + * enum 
framer_iface - Framer interface + * @FRAMER_IFACE_E1: E1 interface + * @FRAMER_IFACE_T1: T1 interface + */ +enum framer_iface { + FRAMER_IFACE_E1, + FRAMER_IFACE_T1, +}; + +/** + * enum framer_clock_type - Framer clock type + * @FRAMER_CLOCK_EXT: External clock + * @FRAMER_CLOCK_INT: Internal clock + */ +enum framer_clock_type { + FRAMER_CLOCK_EXT, + FRAMER_CLOCK_INT, +}; + +/** + * struct framer_config - Framer configuration + * @iface: Framer line interface + * @clock_type: Framer clock type + * @line_clock_rate: Framer line clock rate + */ +struct framer_config { + enum framer_iface iface; + enum framer_clock_type clock_type; + unsigned long line_clock_rate; +}; + +/** + * struct framer_status - Framer status + * @link_is_on: Framer link state. true, the link is on, false, the link is off. + */ +struct framer_status { + bool link_is_on; +}; + +/** + * enum framer_event - Event available for notification + * @FRAMER_EVENT_STATUS: Event notified on framer_status changes + */ +enum framer_event { + FRAMER_EVENT_STATUS, +}; + +/** + * struct framer - represents the framer device + * @dev: framer device + * @id: id of the framer device + * @ops: function pointers for performing framer operations + * @mutex: mutex to protect framer_ops + * @init_count: used to protect when the framer is used by multiple consumers + * @power_count: used to protect when the framer is used by multiple consumers + * @pwr: power regulator associated with the framer + * @notify_status_work: work structure used for status notifications + * @notifier_list: notifier list used for notifications + * @polling_work: delayed work structure used for the polling task + * @prev_status: previous read status used by the polling task to detect changes + */ +struct framer { + struct device dev; + int id; + const struct framer_ops *ops; + struct mutex mutex; /* Protect framer */ + int init_count; + int power_count; + struct regulator *pwr; + struct work_struct notify_status_work; + struct 
blocking_notifier_head notifier_list; + struct delayed_work polling_work; + struct framer_status prev_status; +}; + +#if IS_ENABLED(CONFIG_GENERIC_FRAMER) +int framer_pm_runtime_get(struct framer *framer); +int framer_pm_runtime_get_sync(struct framer *framer); +int framer_pm_runtime_put(struct framer *framer); +int framer_pm_runtime_put_sync(struct framer *framer); +int framer_init(struct framer *framer); +int framer_exit(struct framer *framer); +int framer_power_on(struct framer *framer); +int framer_power_off(struct framer *framer); +int framer_get_status(struct framer *framer, struct framer_status *status); +int framer_get_config(struct framer *framer, struct framer_config *config); +int framer_set_config(struct framer *framer, const struct framer_config *config); +int framer_notifier_register(struct framer *framer, struct notifier_block *nb); +int framer_notifier_unregister(struct framer *framer, struct notifier_block *nb); + +struct framer *framer_get(struct device *dev, const char *con_id); +void framer_put(struct device *dev, struct framer *framer); + +struct framer *devm_framer_get(struct device *dev, const char *con_id); +struct framer *devm_framer_optional_get(struct device *dev, const char *con_id); +#else +static inline int framer_pm_runtime_get(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_get_sync(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_put(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_put_sync(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_init(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_exit(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_power_on(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_power_off(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_get_status(struct framer *framer, 
struct framer_status *status) +{ + return -ENOSYS; +} + +static inline int framer_get_config(struct framer *framer, struct framer_config *config) +{ + return -ENOSYS; +} + +static inline int framer_set_config(struct framer *framer, const struct framer_config *config) +{ + return -ENOSYS; +} + +static inline int framer_notifier_register(struct framer *framer, + struct notifier_block *nb) +{ + return -ENOSYS; +} + +static inline int framer_notifier_unregister(struct framer *framer, + struct notifier_block *nb) +{ + return -ENOSYS; +} + +struct framer *framer_get(struct device *dev, const char *con_id) +{ + return ERR_PTR(-ENOSYS); +} + +void framer_put(struct device *dev, struct framer *framer) +{ +} + +static inline struct framer *devm_framer_get(struct device *dev, const char *con_id) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer *devm_framer_optional_get(struct device *dev, const char *con_id) +{ + return NULL; +} + +#endif + +#endif /* __DRIVERS_FRAMER_H */ -- cgit v1.2.3 From c96e976d9a05d559f4ac4f617ea0f798c75a1799 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 28 Nov 2023 14:25:32 +0100 Subject: net: wan: framer: Add support for the Lantiq PEF2256 framer The Lantiq PEF2256 is a framer and line interface component designed to fulfill all required interfacing between an analog E1/T1/J1 line and the digital PCM system highway/H.100 bus. 
Signed-off-by: Herve Codina Reviewed-by: Christophe Leroy Reviewed-by: Linus Walleij Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231128132534.258459-4-herve.codina@bootlin.com Signed-off-by: Linus Walleij --- include/linux/framer/pef2256.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/linux/framer/pef2256.h (limited to 'include/linux') diff --git a/include/linux/framer/pef2256.h b/include/linux/framer/pef2256.h new file mode 100644 index 0000000000000..71d80af58c406 --- /dev/null +++ b/include/linux/framer/pef2256.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PEF2256 consumer API + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina + */ +#ifndef __PEF2256_H__ +#define __PEF2256_H__ + +#include + +struct pef2256; +struct regmap; + +/* Retrieve the PEF2256 regmap */ +struct regmap *pef2256_get_regmap(struct pef2256 *pef2256); + +/* PEF2256 hardware versions */ +enum pef2256_version { + PEF2256_VERSION_UNKNOWN, + PEF2256_VERSION_1_2, + PEF2256_VERSION_2_1, + PEF2256_VERSION_2_2, +}; + +/* Get the PEF2256 hardware version */ +enum pef2256_version pef2256_get_version(struct pef2256 *pef2256); + +#endif /* __PEF2256_H__ */ -- cgit v1.2.3 From d3bb89ea9c13e5a98d2b7a0ba8e50a77893132cb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 7 Dec 2023 23:25:25 +0800 Subject: mm: fix VMA heap bounds checking After converting selinux to VMA heap check helper, the gcl triggers an execheap SELinux denial, which is caused by a changed logic check. Previously selinux only checked that the VMA range was within the VMA heap range, and the implementation checks the intersection between the two ranges, but the corner case (vm_end=start_brk, brk=vm_start) isn't handled correctly. 
Since commit 11250fd12eb8 ("mm: factor out VMA stack and heap checks") was only a function extraction, it seems that the issue was introduced by commit 0db0c01b53a1 ("procfs: fix /proc/<pid>/maps heap check"). Let's fix the above corner cases and, meanwhile, correct the wrong indentation of the stack and heap check helpers. Fixes: 11250fd12eb8 ("mm: factor out VMA stack and heap checks") Signed-off-by: Kefeng Wang Reported-by: Ondrej Mosnacek Closes: https://lore.kernel.org/selinux/CAFqZXNv0SVT0fkOK6neP9AXbj3nxJ61JAY4+zJzvxqJaeuhbFw@mail.gmail.com/ Tested-by: Ondrej Mosnacek Link: https://lkml.kernel.org/r/20231207152525.2607420-1-wangkefeng.wang@huawei.com Cc: David Hildenbrand Cc: Paul Moore Cc: Peter Zijlstra Cc: Stephen Smalley Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece7..da5219b48d522 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -886,8 +886,8 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { - return vma->vm_start <= vma->vm_mm->brk && - vma->vm_end >= vma->vm_mm->start_brk; + return vma->vm_start < vma->vm_mm->brk && + vma->vm_end > vma->vm_mm->start_brk; } /* @@ -901,8 +901,8 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) * its "stack". It's not even well-defined for programs written * languages like Go.
*/ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) -- cgit v1.2.3 From 6376a824595607e99d032a39ba3394988b4fce96 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 8 Dec 2023 17:50:18 +0000 Subject: mm/damon/core: make damon_start() waits until kdamond_fn() starts The cleanup tasks of kdamond threads including reset of corresponding DAMON context's ->kdamond field and decrease of global nr_running_ctxs counter is supposed to be executed by kdamond_fn(). However, commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") made neither damon_start() nor damon_stop() ensure the corresponding kdamond has started the execution of kdamond_fn(). As a result, the cleanup can be skipped if damon_stop() is called fast enough after the previous damon_start(). Especially the skipped reset of ->kdamond could cause a use-after-free. Fix it by waiting for start of kdamond_fn() execution from damon_start(). 
Link: https://lkml.kernel.org/r/20231208175018.63880-1-sj@kernel.org Fixes: 0f91d13366a4 ("mm/damon: simplify stop mechanism") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: Changbin Du Cc: Jakub Acs Cc: # 5.15.x Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ab2f17d9926b5..e00ddf1ed39c0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -559,6 +559,8 @@ struct damon_ctx { * update */ unsigned long next_ops_update_sis; + /* for waiting until the execution of the kdamond_fn is started */ + struct completion kdamond_started; /* public: */ struct task_struct *kdamond; -- cgit v1.2.3 From 081488051d28d32569ebb7c7a23572778b2e7d57 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 7 Dec 2023 23:14:04 -0700 Subject: mm/mglru: fix underprotected page cache Unmapped folios accessed through file descriptors can be underprotected. Those folios are added to the oldest generation based on: 1. The fact that they are less costly to reclaim (no need to walk the rmap and flush the TLB) and have less impact on performance (don't cause major PFs and can be non-blocking if needed again). 2. The observation that they are likely to be single-use. E.g., for client use cases like Android, its apps parse configuration files and store the data in heap (anon); for server use cases like MySQL, it reads from InnoDB files and holds the cached data for tables in buffer pools (anon). However, the oldest generation can be very short lived, and if so, it doesn't provide the PID controller with enough time to respond to a surge of refaults. (Note that the PID controller uses weighted refaults and those from evicted generations only take a half of the whole weight.) In other words, for a short lived generation, the moving average smooths out the spike quickly. To fix the problem: 1. 
For folios that are already on LRU, if they can be beyond the tracking range of tiers, i.e., five accesses through file descriptors, move them to the second oldest generation to give them more time to age. (Note that tiers are used by the PID controller to statistically determine whether folios accessed multiple times through file descriptors are worth protecting.) 2. When adding unmapped folios to LRU, adjust the placement of them so that they are not too close to the tail. The effect of this is similar to the above. On Android, launching 55 apps sequentially: Before After Change workingset_refault_anon 25641024 25598972 0% workingset_refault_file 115016834 106178438 -8% Link: https://lkml.kernel.org/r/20231208061407.2125867-1-yuzhao@google.com Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") Signed-off-by: Yu Zhao Reported-by: Charan Teja Kalla Tested-by: Kalesh Singh Cc: T.J. Mercier Cc: Kairui Song Cc: Hillf Danton Cc: Jaroslav Pulchart Cc: Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 9ae7def16cb2a..f4fe593c1400e 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -232,22 +232,27 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; /* - * There are three common cases for this page: - * 1. If it's hot, e.g., freshly faulted in or previously hot and - * migrated, add it to the youngest generation. - * 2. If it's cold but can't be evicted immediately, i.e., an anon page - * not in swapcache or a dirty page pending writeback, add it to the - * second oldest generation. - * 3. Everything else (clean, cold) is added to the oldest generation. + * There are four common cases for this page: + * 1. 
If it's hot, i.e., freshly faulted in, add it to the youngest + * generation, and it's protected over the rest below. + * 2. If it can't be evicted immediately, i.e., a dirty page pending + * writeback, add it to the second youngest generation. + * 3. If it should be evicted first, e.g., cold and clean from + * folio_rotate_reclaimable(), add it to the oldest generation. + * 4. Everything else falls between 2 & 3 above and is added to the + * second oldest generation if it's considered inactive, or the + * oldest generation otherwise. See lru_gen_is_active(). */ if (folio_test_active(folio)) seq = lrugen->max_seq; else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->min_seq[type] + 1; - else + seq = lrugen->max_seq - 1; + else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) seq = lrugen->min_seq[type]; + else + seq = lrugen->min_seq[type] + 1; gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; -- cgit v1.2.3 From 8aa420617918d12d1f5d55030a503c9418e73c2c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 7 Dec 2023 23:14:06 -0700 Subject: mm/mglru: respect min_ttl_ms with memcgs While investigating kswapd "consuming 100% CPU" [1] (also see "mm/mglru: try to stop at high watermarks"), it was discovered that the memcg LRU can breach the thrashing protection imposed by min_ttl_ms. 
Before the memcg LRU: kswapd() shrink_node_memcgs() mem_cgroup_iter() inc_max_seq() // always hit a different memcg lru_gen_age_node() mem_cgroup_iter() check the timestamp of the oldest generation After the memcg LRU: kswapd() shrink_many() restart: iterate the memcg LRU: inc_max_seq() // occasionally hit the same memcg if raced with lru_gen_rotate_memcg(): goto restart lru_gen_age_node() mem_cgroup_iter() check the timestamp of the oldest generation Specifically, when the restart happens in shrink_many(), it needs to stick with the (memcg LRU) generation it began with. In other words, it should neither re-read memcg_lru->seq nor age an lruvec of a different generation. Otherwise it can hit the same memcg multiple times without giving lru_gen_age_node() a chance to check the timestamp of that memcg's oldest generation (against min_ttl_ms). [1] https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20231208061407.2125867-3-yuzhao@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Yu Zhao Tested-by: T.J. Mercier Cc: Charan Teja Kalla Cc: Hillf Danton Cc: Jaroslav Pulchart Cc: Kairui Song Cc: Kalesh Singh Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3c25226beeed4..23533b12bee2f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -505,33 +505,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); * the old generation, is incremented when all its bins become empty. * * There are four operations: - * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its + * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its * current generation (old or young) and updates its "seg" to "head"; - * 2. 
MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its + * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its * current generation (old or young) and updates its "seg" to "tail"; - * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old + * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old * generation, updates its "gen" to "old" and resets its "seg" to "default"; - * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the + * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the * young generation, updates its "gen" to "young" and resets its "seg" to * "default". * * The events that trigger the above operations are: * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; - * 2. The first attempt to reclaim an memcg below low, which triggers + * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; - * 3. The first attempt to reclaim an memcg below reclaimable size threshold, + * 3. The first attempt to reclaim a memcg below reclaimable size threshold, * which triggers MEMCG_LRU_TAIL; - * 4. The second attempt to reclaim an memcg below reclaimable size threshold, + * 4. The second attempt to reclaim a memcg below reclaimable size threshold, * which triggers MEMCG_LRU_YOUNG; - * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG; + * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; - * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD. + * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. * - * Note that memcg LRU only applies to global reclaim, and the round-robin - * incrementing of their max_seq counters ensures the eventual fairness to all - * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + * Notes: + * 1. 
Memcg LRU only applies to global reclaim, and the round-robin incrementing + * of their max_seq counters ensures the eventual fairness to all eligible + * memcgs. For memcg reclaim, it still relies on mem_cgroup_iter(). + * 2. There are only two valid generations: old (seq) and young (seq+1). + * MEMCG_NR_GENS is set to three so that when reading the generation counter + * locklessly, a stale value (seq-1) does not wraparound to young. */ -#define MEMCG_NR_GENS 2 +#define MEMCG_NR_GENS 3 #define MEMCG_NR_BINS 8 struct lru_gen_memcg { -- cgit v1.2.3 From 4376807bf2d5371c3e00080c972be568c3f8a7d1 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 7 Dec 2023 23:14:07 -0700 Subject: mm/mglru: reclaim offlined memcgs harder In the effort to reduce zombie memcgs [1], it was discovered that the memcg LRU doesn't apply enough pressure on offlined memcgs. Specifically, instead of rotating them to the tail of the current generation (MEMCG_LRU_TAIL) for a second attempt, it moves them to the next generation (MEMCG_LRU_YOUNG) after the first attempt. Not applying enough pressure on offlined memcgs can cause them to build up, and this can be particularly harmful to memory-constrained systems. On Pixel 8 Pro, launching apps for 50 cycles: Before After Change Zombie memcgs 45 35 -22% [1] https://lore.kernel.org/CABdmKX2M6koq4Q0Cmp_-=wbP0Qa190HdEGGaHfxNS05gAkUtPA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20231208061407.2125867-4-yuzhao@google.com Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists") Signed-off-by: Yu Zhao Reported-by: T.J. Mercier Tested-by: T.J. 
Mercier Cc: Charan Teja Kalla Cc: Hillf Danton Cc: Jaroslav Pulchart Cc: Kairui Song Cc: Kalesh Singh Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 23533b12bee2f..9db36e1977125 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -519,10 +519,10 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD; * 2. The first attempt to reclaim a memcg below low, which triggers * MEMCG_LRU_TAIL; - * 3. The first attempt to reclaim a memcg below reclaimable size threshold, - * which triggers MEMCG_LRU_TAIL; - * 4. The second attempt to reclaim a memcg below reclaimable size threshold, - * which triggers MEMCG_LRU_YOUNG; + * 3. The first attempt to reclaim a memcg offlined or below reclaimable size + * threshold, which triggers MEMCG_LRU_TAIL; + * 4. The second attempt to reclaim a memcg offlined or below reclaimable size + * threshold, which triggers MEMCG_LRU_YOUNG; * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG; * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG; * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD. -- cgit v1.2.3 From 05ce71929efc79f5978589e0456a54eb0fe6485e Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 27 Nov 2023 16:19:31 +0100 Subject: PM: domains: Drop the unused pm_genpd_opp_to_performance_state() Since commit 7c41cdcd3bbe ("OPP: Simplify the over-designed pstate <-> level dance"), there is no longer any users of the pm_genpd_opp_to_performance_state() API. Let's therefore drop it and its corresponding ->opp_to_performance_state() callback, which also no longer has any users. Signed-off-by: Ulf Hansson Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20231127151931.47055-1-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 34663d0d5c559..b97c5e9820f97 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -118,7 +118,6 @@ struct genpd_power_state { }; struct genpd_lock_ops; -struct dev_pm_opp; struct opp_table; struct generic_pm_domain { @@ -146,8 +145,6 @@ struct generic_pm_domain { int (*power_on)(struct generic_pm_domain *domain); struct raw_notifier_head power_notifiers; /* Power on/off notifiers */ struct opp_table *opp_table; /* OPP table of the genpd */ - unsigned int (*opp_to_performance_state)(struct generic_pm_domain *genpd, - struct dev_pm_opp *opp); int (*set_performance_state)(struct generic_pm_domain *genpd, unsigned int state); struct gpd_dev_ops dev_ops; @@ -348,8 +345,6 @@ int of_genpd_remove_subdomain(struct of_phandle_args *parent_spec, struct generic_pm_domain *of_genpd_remove_last(struct device_node *np); int of_genpd_parse_idle_states(struct device_node *dn, struct genpd_power_state **states, int *n); -unsigned int pm_genpd_opp_to_performance_state(struct device *genpd_dev, - struct dev_pm_opp *opp); int genpd_dev_pm_attach(struct device *dev); struct device *genpd_dev_pm_attach_by_id(struct device *dev, @@ -395,13 +390,6 @@ static inline int of_genpd_parse_idle_states(struct device_node *dn, return -ENODEV; } -static inline unsigned int -pm_genpd_opp_to_performance_state(struct device *genpd_dev, - struct dev_pm_opp *opp) -{ - return 0; -} - static inline int genpd_dev_pm_attach(struct device *dev) { return 0; -- cgit v1.2.3 From 4f7aa122bc9219baca0bfface5917062d6c45ee8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 7 Dec 2023 16:12:04 +0100 Subject: dpll: remove leftover mode_supported() op and use mode_get() instead Mode supported is currently reported to the user exactly 
the same as the current mode. That's because mode changing is not implemented. Remove the leftover mode_supported() op and use mode_get() to fill up the supported mode exposed to the user. If mode changing is ever introduced, this could very easily be brought back. In the meantime, prevent drivers from implementing this in the wrong way (as, for example, a recent netdevsim implementation attempt intended to do). Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/dpll.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 578fc5fa3750c..b1a5f9ca8ee5d 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -17,9 +17,6 @@ struct dpll_pin; struct dpll_device_ops { int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_mode *mode, struct netlink_ext_ack *extack); - bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv, - const enum dpll_mode mode, - struct netlink_ext_ack *extack); int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_lock_status *status, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 59b3e31e73322ec195e45e0a1da712c752ee1b0c Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 28 Nov 2023 04:00:10 +0000 Subject: leds: trigger: netdev: Extend speeds up to 10G Add 2.5G, 5G and 10G as available speeds to the netdev LED trigger.
Signed-off-by: Daniel Golle Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/99e7d3304c6bba7f4863a4a80764a869855f2085.1701143925.git.daniel@makrotopia.org Signed-off-by: Lee Jones --- include/linux/leds.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index aa16dc2a8230f..1bdf7f5a0d7c0 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -588,6 +588,9 @@ enum led_trigger_netdev_modes { TRIGGER_NETDEV_LINK_10, TRIGGER_NETDEV_LINK_100, TRIGGER_NETDEV_LINK_1000, + TRIGGER_NETDEV_LINK_2500, + TRIGGER_NETDEV_LINK_5000, + TRIGGER_NETDEV_LINK_10000, TRIGGER_NETDEV_HALF_DUPLEX, TRIGGER_NETDEV_FULL_DUPLEX, TRIGGER_NETDEV_TX, -- cgit v1.2.3 From 4ff4379ce6eefe81695bcc2e021ce1dac3d707d2 Mon Sep 17 00:00:00 2001 From: Florian Eckert Date: Mon, 27 Nov 2023 12:03:08 +0100 Subject: tty: add new helper function tty_get_tiocm There is no in-kernel function to get the status register of a tty device like the TIOCMGET ioctl returns to userspace. Create a new function, tty_get_tiocm(), to obtain the status register that other portions of the kernel can call if they need this information, and move the existing internal tty_tiocmget() function to use this interface. 
Signed-off-by: Florian Eckert Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20231127110311.3583957-2-fe@dev.tdt.de Signed-off-by: Lee Jones --- include/linux/tty.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tty.h b/include/linux/tty.h index 4b6340ac2af28..d219a11e3fe0f 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -419,6 +419,7 @@ bool tty_unthrottle_safe(struct tty_struct *tty); int tty_do_resize(struct tty_struct *tty, struct winsize *ws); int tty_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount); +int tty_get_tiocm(struct tty_struct *tty); int is_current_pgrp_orphaned(void); void tty_hangup(struct tty_struct *tty); void tty_vhangup(struct tty_struct *tty); -- cgit v1.2.3 From bdc22c8d52d70fc5655ab4dbf72fa79b034bb7b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 5 Dec 2023 20:18:39 +0100 Subject: thermal: trip: Send trip change notifications on all trip updates The _store callbacks of the trip point temperature and hysteresis sysfs attributes invoke thermal_notify_tz_trip_change() to send a notification regarding the trip point change, but when trip points are updated by the platform firmware, trip point change notifications are not sent. To make the behavior after a trip point change more consistent, modify all of the 3 places where trip point temperature is updated to use a new function called thermal_zone_set_trip_temp() for this purpose and make that function call thermal_notify_tz_trip_change(). Note that trip point hysteresis can only be updated via sysfs and trip_point_hyst_store() calls thermal_notify_tz_trip_change() already, so this code path need not be changed. Signed-off-by: Rafael J. 
Wysocki Acked-by: Daniel Lezcano --- include/linux/thermal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index bedbaec9a42e1..09f6eb82c191c 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -291,6 +291,8 @@ int thermal_zone_for_each_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); int thermal_zone_get_num_trips(struct thermal_zone_device *tz); +void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, + struct thermal_trip *trip, int temp); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp); -- cgit v1.2.3 From ad6534c626fedd818718d76c36d69c7d8e7b61cc Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:49 +0800 Subject: PCI: Add Alibaba Vendor ID to linux/pci_ids.h The Alibaba Vendor ID (0x1ded) is now used by Alibaba elasticRDMA ("erdma") and will be shared with the upcoming PCIe PMU ("dwc_pcie_pmu"). Move the Vendor ID to linux/pci_ids.h so that it can be shared by several drivers later.
Signed-off-by: Shuai Xue Acked-by: Bjorn Helgaas # pci_ids.h Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20231208025652.87192-3-xueshuai@linux.alibaba.com Signed-off-by: Will Deacon --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 275799b5f535c..844ffdac8d7d1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2605,6 +2605,8 @@ #define PCI_VENDOR_ID_TEKRAM 0x1de1 #define PCI_DEVICE_ID_TEKRAM_DC290 0xdc29 +#define PCI_VENDOR_ID_ALIBABA 0x1ded + #define PCI_VENDOR_ID_TEHUTI 0x1fc9 #define PCI_DEVICE_ID_TEHUTI_3009 0x3009 #define PCI_DEVICE_ID_TEHUTI_3010 0x3010 -- cgit v1.2.3 From ac16087134b837d42b75bb1c741070b6c142f258 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 8 Dec 2023 10:56:50 +0800 Subject: PCI: Move pci_clear_and_set_dword() helper to PCI header The clear and set pattern is commonly used for accessing PCI config, move the helper pci_clear_and_set_dword() from aspm.c into PCI header. In addition, rename to pci_clear_and_set_config_dword() to retain the "config" information and match the other accessors. No functional change intended. 
Signed-off-by: Shuai Xue Acked-by: Bjorn Helgaas Tested-by: Ilkka Koskinen Link: https://lore.kernel.org/r/20231208025652.87192-4-xueshuai@linux.alibaba.com Signed-off-by: Will Deacon --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc8679..268c4bd98ef3e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1239,6 +1239,8 @@ int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val); int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val); int pci_write_config_word(const struct pci_dev *dev, int where, u16 val); int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val); +void pci_clear_and_set_config_dword(const struct pci_dev *dev, int pos, + u32 clear, u32 set); int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val); int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val); -- cgit v1.2.3 From c82a1662d4548c454de5343b88f69b9fc82266b3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 Dec 2023 23:56:41 +0100 Subject: leds: trigger: Remove unused function led_trigger_rename_static() This function was added with a8df7b1ab70b ("leds: add led_trigger_rename function") 11 yrs ago, but it has no users. So remove it. 
Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/d90f30be-f661-4db7-b0b5-d09d07a78a68@gmail.com Signed-off-by: Lee Jones --- include/linux/leds.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 1bdf7f5a0d7c0..4754b02d3a2c5 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -527,23 +527,6 @@ static inline void *led_get_trigger_data(struct led_classdev *led_cdev) return led_cdev->trigger_data; } -/** - * led_trigger_rename_static - rename a trigger - * @name: the new trigger name - * @trig: the LED trigger to rename - * - * Change a LED trigger name by copying the string passed in - * name into current trigger name, which MUST be large - * enough for the new string. - * - * Note that name must NOT point to the same string used - * during LED registration, as that could lead to races. - * - * This is meant to be used on triggers with statically - * allocated name. - */ -void led_trigger_rename_static(const char *name, struct led_trigger *trig); - #define module_led_trigger(__led_trigger) \ module_driver(__led_trigger, led_trigger_register, \ led_trigger_unregister) -- cgit v1.2.3 From 595e52284d24adc376890d3fc93bdca4707d9aca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Dec 2023 08:58:15 -0700 Subject: io_uring/poll: don't enable lazy wake for POLLEXCLUSIVE There are a few quirks around using lazy wake for poll unconditionally, and one of them is related to EPOLLEXCLUSIVE. Those may trigger exclusive wakeups, which wake a limited number of entries in the wait queue. If that wake number is less than the number of entries someone is waiting for (and that someone is also using DEFER_TASKRUN), then we can get stuck waiting for more entries while we should be processing the ones we already got. If we're doing exclusive poll waits, flag the request as not being compatible with lazy wakeups.
Reported-by: Pavel Begunkov Fixes: 6ce4a93dbb5b ("io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 805bb635cdf55..239a4f68801bb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -434,6 +434,7 @@ enum { /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, + REQ_F_POLL_NO_LAZY_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -501,6 +502,8 @@ enum { REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), + /* don't use lazy poll wake for this request */ + REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From 3c6b0c1c28184038d90dffe8eb542bedcb8ccf98 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 30 Nov 2023 14:27:29 +0100 Subject: srcu: Use try-lock lockdep annotation for NMI-safe access. It is claimed that srcu_read_lock_nmisafe() is NMI-safe. However it triggers a lockdep if used from NMI because lockdep expects a deadlock since nothing disables NMIs while the lock is acquired. This is because commit f0f44752f5f61 ("rcu: Annotate SRCU's update-side lockdep dependencies") annotates synchronize_srcu() as a write lock usage. This helps to detect deadlocks such as srcu_read_lock(); synchronize_srcu(); srcu_read_unlock(); The side effect is that the lock srcu_struct now has a USED usage in normal contexts, so it conflicts with a USED_READ usage in NMI. But this shouldn't cause a real deadlock because the write lock usage from synchronize_srcu() is a fake one and only used for read/write deadlock detection.
Use a try-lock annotation for srcu_read_lock_nmisafe() to avoid lockdep complaints if used from NMI. Fixes: f0f44752f5f6 ("rcu: Annotate SRCU's update-side lockdep dependencies") Link: https://lore.kernel.org/r/20230927160231.XRCDDSK4@linutronix.de Reviewed-by: Boqun Feng Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- include/linux/rcupdate.h | 6 ++++++ include/linux/srcu.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f7206b2623c98..31d523c4e0893 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -301,6 +301,11 @@ static inline void rcu_lock_acquire(struct lockdep_map *map) lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } +static inline void rcu_try_lock_acquire(struct lockdep_map *map) +{ + lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_); +} + static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); @@ -315,6 +320,7 @@ int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) +# define rcu_try_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 127ef3b2e6073..236610e4a8fa5 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -229,7 +229,7 @@ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp srcu_check_nmi_safety(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); - rcu_lock_acquire(&ssp->dep_map); + rcu_try_lock_acquire(&ssp->dep_map); return retval; } -- cgit v1.2.3 From 493dffa3ab07b5d2c0b7bd5de5bff6e85f01f52a Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Wed, 20 Sep 2023 11:22:12 +0200 Subject: rculist.h: docs: Fix wrong function summary The brief summary in the docstring for function
list_next_or_null_rcu() states that the function is supposed to provide the "first" member of a list, whereas in truth it returns the next member. Change the docstring so it describes what the function actually does. Signed-off-by: Philipp Stanner Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- include/linux/rculist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index d29740be4833e..3dc1e58865f77 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -355,7 +355,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, }) /** - * list_next_or_null_rcu - get the first element from a list + * list_next_or_null_rcu - get the next element from a list * @head: the head for the list. * @ptr: the list head to take the next element from. * @type: the type of the struct this is embedded in. -- cgit v1.2.3 From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001 From: Jie Jiang Date: Tue, 12 Dec 2023 09:39:23 +0000 Subject: bpf: Support uid and gid when mounting bpffs Parse uid and gid in bpf_parse_param() so that they can be passed in as the `data` parameter when mount() bpffs. This will be useful when we want to control which user/group has the control to the mounted bpffs, otherwise a separate chown() call will be needed. 
Signed-off-by: Jie Jiang Signed-off-by: Andrii Nakryiko Acked-by: Mike Frysinger Acked-by: Christian Brauner Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0bd4889e917a3..c87c608a36892 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1595,6 +1595,8 @@ struct bpf_link_primer { }; struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; umode_t mode; /* BPF token-related delegation options */ -- cgit v1.2.3 From 537fec0733c4a72e2a2b69fee365459c5b75d92e Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:42 +0100 Subject: net: make vlan_get_tag() return -ENODATA instead of -EINVAL __vlan_hwaccel_get_tag() is used in veth XDP hints implementation, its return value (-EINVAL if skb is not VLAN tagged) is passed to bpf code, but XDP hints specification requires drivers to return -ENODATA, if a hint cannot be provided for a particular packet. Solve this inconsistency by changing error return value of __vlan_hwaccel_get_tag() from -EINVAL to -ENODATA, do the same thing to __vlan_get_tag(), because this function is supposed to follow the same convention. This, in turn, makes -ENODATA the only non-zero value vlan_get_tag() can return. We can do this with no side effects, because none of the users of the 3 above-mentioned functions rely on the exact value. 
Suggested-by: Jesper Dangaard Brouer Acked-by: Stanislav Fomichev Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20231205210847.28460-14-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/if_vlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 3028af87716e2..c1645c86eed96 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -540,7 +540,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) - return -EINVAL; + return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; @@ -561,7 +561,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, return 0; } else { *vlan_tci = 0; - return -EINVAL; + return -ENODATA; } } -- cgit v1.2.3 From 7978bad4b6b9265a1e808a5f679ee428d1dd6523 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:43 +0100 Subject: mlx5: implement VLAN tag XDP hint Implement the newly added .xmo_rx_vlan_tag() hint function. 
Reviewed-by: Tariq Toukan Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20231205210847.28460-15-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 820bca965fb6f..01275c6e84688 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -918,7 +918,7 @@ static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe) return (cqe->tls_outer_l3_tunneled >> 3) & 0x3; } -static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) +static inline bool cqe_has_vlan(const struct mlx5_cqe64 *cqe) { return cqe->l4_l3_hdr_type & 0x1; } -- cgit v1.2.3 From 0c476157085fe2ad13b9bec70ea672e86647fa1a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 12 Dec 2023 06:41:43 +0100 Subject: net: phy: c45: add genphy_c45_pma_read_ext_abilities() function Move part of the genphy_c45_pma_read_abilities() code to a separate function. Some PHYs do not implement PMA/PMD status 2 register (Register 1.8) but do implement PMA/PMD extended ability register (Register 1.11). To make use of it, we need to be able to access this part of code separately. 
Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20231212054144.87527-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6e7ebcc50b859..dbb5e13e3e1bf 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1866,6 +1866,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); +int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); -- cgit v1.2.3 From 13049408a4bd29c92227ca2d6befab80dbb96663 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 7 May 2023 16:47:42 +0300 Subject: net/mlx5: Add mlx5_ifc bits used for supporting single netdev Socket-Direct Multiple device caps and features are required to support single netdev Socket-Direct. Add them here in preparation for the feature implementation. 
Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ce2e71cd6d2a3..405d141b4a085 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -435,7 +435,7 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 flow_table_modify[0x1]; u8 reformat[0x1]; u8 decap[0x1]; - u8 reserved_at_9[0x1]; + u8 reset_root_to_default[0x1]; u8 pop_vlan[0x1]; u8 push_vlan[0x1]; u8 reserved_at_c[0x1]; @@ -1801,7 +1801,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_uc[0x1]; u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; - u8 reserved_at_3e8[0x2]; + u8 reserved_at_3e8[0x1]; + u8 silent_mode[0x1]; u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; @@ -1818,7 +1819,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_460[0x1]; u8 ats[0x1]; - u8 reserved_at_462[0x1]; + u8 cross_vhca_rqt[0x1]; u8 log_max_uctx[0x5]; u8 reserved_at_468[0x1]; u8 crypto[0x1]; @@ -1943,6 +1944,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { enum { MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS = 0x80000, + MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_ROOT_TO_REMOTE_FLOW_TABLE = (1ULL << 20), }; enum { @@ -1992,7 +1994,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_260[0x120]; u8 reserved_at_380[0x10]; u8 ec_vf_vport_base[0x10]; - u8 reserved_at_3a0[0x460]; + + u8 reserved_at_3a0[0x10]; + u8 max_rqt_vhca_id[0x10]; + + u8 reserved_at_3c0[0x440]; }; enum mlx5_ifc_flow_destination_type { @@ -2151,6 +2157,13 @@ struct mlx5_ifc_rq_num_bits { u8 rq_num[0x18]; }; +struct mlx5_ifc_rq_vhca_bits { + u8 reserved_at_0[0x8]; + u8 rq_num[0x18]; + u8 reserved_at_20[0x10]; + u8 rq_vhca_id[0x10]; +}; + struct mlx5_ifc_mac_address_layout_bits { u8 
reserved_at_0[0x10]; u8 mac_addr_47_32[0x10]; @@ -3901,7 +3914,10 @@ struct mlx5_ifc_rqtc_bits { u8 reserved_at_e0[0x6a0]; - struct mlx5_ifc_rq_num_bits rq_num[]; + union { + DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_num_bits, rq_num); + DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_vhca_bits, rq_vhca); + }; }; enum { @@ -4744,7 +4760,10 @@ struct mlx5_ifc_set_l2_table_entry_in_bits { u8 reserved_at_c0[0x20]; - u8 reserved_at_e0[0x13]; + u8 reserved_at_e0[0x10]; + u8 silent_mode_valid[0x1]; + u8 silent_mode[0x1]; + u8 reserved_at_f2[0x1]; u8 vlan_valid[0x1]; u8 vlan[0xc]; -- cgit v1.2.3 From f5e956329960903d908668d7a20bbc08e0a8b92b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 7 Aug 2023 09:05:34 +0300 Subject: net/mlx5: Expose Management PCIe Index Register (MPIR) MPIR register allows to query the PCIe indexes and Socket-Direct related parameters. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d2b8d4a74a308..2f67cec1a898d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -150,6 +150,7 @@ enum { MLX5_REG_MTPPSE = 0x9054, MLX5_REG_MTUTC = 0x9055, MLX5_REG_MPEGC = 0x9056, + MLX5_REG_MPIR = 0x9059, MLX5_REG_MCQS = 0x9060, MLX5_REG_MCQI = 0x9061, MLX5_REG_MCC = 0x9062, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 405d141b4a085..828938368fb7f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10108,6 +10108,20 @@ struct mlx5_ifc_mpegc_reg_bits { u8 reserved_at_60[0x100]; }; +struct mlx5_ifc_mpir_reg_bits { + u8 sdm[0x1]; + u8 reserved_at_1[0x1b]; + u8 host_buses[0x4]; + + u8 reserved_at_20[0x20]; + + u8 local_port[0x8]; + u8 reserved_at_28[0x15]; + u8 sd_group[0x3]; + + u8 reserved_at_60[0x20]; +}; + enum { 
MLX5_MTUTC_FREQ_ADJ_UNITS_PPB = 0x0, MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM = 0x1, -- cgit v1.2.3 From b25bd37c859f32e50a436ab9d2078b76e433008e Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 6 Aug 2023 14:01:10 +0300 Subject: net/mlx5: Move TISes from priv to mdev HW resources The transport interface send (TIS) object is responsible for performing all transport related operations of the transmit side. Messages from Send Queues get segmented and transmitted by the TIS including all transport required implications, e.g. in the case of large send offload, the TIS is responsible for the segmentation. These are stateless objects and can be used by multiple netdevs (e.g. representors) who share the same core device. Providing the TISes as a service from the core layer to the netdev layer reduces the number of replicated TIS objects (in case of multiple netdevs), and will ease the transition to netdev with multiple mdevs. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2f67cec1a898d..7ee5b79ff3d60 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -679,6 +679,8 @@ struct mlx5e_resources { struct mlx5_td td; u32 mkey; struct mlx5_sq_bfreg bfreg; +#define MLX5_MAX_NUM_TC 8 + u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit v1.2.3 From 50d73710715de7d1a2c88194562f520816af9c2a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 12 Dec 2023 15:27:51 +0100 Subject: ethtool: add SET for TCP_DATA_SPLIT ringparam Follow up commit 9690ae604290 ("ethtool: add header/data split indication") and add the set part of Ethtool's header split, i.e. ability to enable/disable header split via the Ethtool Netlink interface.
This might be helpful to optimize the setup for particular workloads, for example, to avoid XDP frags, and so on. A driver should advertise ``ETHTOOL_RING_USE_TCP_DATA_SPLIT`` in its ops->supported_ring_params to allow doing that. "Unknown" passed from the userspace when the header split is supported means the driver is free to choose the preferred state. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20231212142752.935000-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index deb683d3360f0..67b30940234be 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -95,6 +95,7 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len + * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), @@ -102,6 +103,7 @@ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_TX_PUSH = BIT(2), ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), + ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) -- cgit v1.2.3 From 0a149ab78ee220c75eef797abea7a29f4490e226 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Tue, 12 Dec 2023 12:46:11 +0800 Subject: page_pool: transition to reference count management after page draining To support multiple users referencing the same fragment, 'pp_frag_count' is renamed to 'pp_ref_count', transitioning pp pages from fragment management to reference count management after draining based on the suggestion from [1]. 
The idea is that the concept of fragmenting exists before the page is drained, and all related functions retain their current names. However, once the page is drained, its management shifts to being governed by 'pp_ref_count'. Therefore, all functions associated with that lifecycle stage of a pp page are renamed. [1] http://lore.kernel.org/netdev/f71d9448-70c8-8793-dc9a-0eb48a570300@huawei.com Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Reviewed-by: Ilias Apalodimas Reviewed-by: Mina Almasry Link: https://lore.kernel.org/r/20231212044614.42733-2-liangchen.linux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2a..64e4572ef06de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -125,7 +125,7 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - atomic_long_t pp_frag_count; + atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ -- cgit v1.2.3 From 6caa290684255991ffeebf228b2fd9e7e4da8f34 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:02:56 -0800 Subject: Input: navpoint - convert to use GPIO descriptor The Navpoint driver uses a GPIO line, convert this to use a GPIO descriptor. There are no in-kernel users but out of tree users can easily be added or converted using a GPIO descriptor table as with numerous other drivers. 
Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-1-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- include/linux/input/navpoint.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input/navpoint.h b/include/linux/input/navpoint.h index d464ffb4db52b..5192ae3f5ec1b 100644 --- a/include/linux/input/navpoint.h +++ b/include/linux/input/navpoint.h @@ -5,5 +5,4 @@ struct navpoint_platform_data { int port; /* PXA SSP port for pxa_ssp_request() */ - int gpio; /* GPIO for power on/off */ }; -- cgit v1.2.3 From e53c18da99c75f080bd99436c57824f2ab657f03 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:06:55 -0800 Subject: Input: omap-keypad - drop optional GPIO support The driver supports passing some GPIO lines for rows and columns through the driver data, but there is no in-kernel user of this. Further the use seems convoluted because the GPIO lines are unused in the driver, then explicitly freed when removing it without being requested when probing it, which is asymmetric and just a recipe for disaster. Remove the support for these unused GPIOs; if need be, support can be reestablished in an organized fashion using GPIO descriptors.
Signed-off-by: Linus Walleij Reviewed-by: Tony Lindgren Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-3-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/keypad-omap.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/keypad-omap.h b/include/linux/platform_data/keypad-omap.h index 3e7c64c854f4c..f3f1311cdf3aa 100644 --- a/include/linux/platform_data/keypad-omap.h +++ b/include/linux/platform_data/keypad-omap.h @@ -19,9 +19,6 @@ struct omap_kp_platform_data { bool rep; unsigned long delay; bool dbounce; - /* specific to OMAP242x*/ - unsigned int *row_gpios; - unsigned int *col_gpios; }; /* Group (0..3) -- when multiple keys are pressed, only the -- cgit v1.2.3 From 7395de647e87476f5b5d2f9a9fe80cee86b4e7cc Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 23:04:47 -0800 Subject: Input: as5011 - convert to GPIO descriptor This driver does not have any in-tree users but is passing a legacy GPIO number through platform data. Convert it to use a GPIO descriptor; new users or out-of-tree users can easily be implemented using GPIO descriptor tables or software nodes.
Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20231129-descriptors-input-v1-4-9433162914a3@linaro.org Signed-off-by: Dmitry Torokhov --- include/linux/input/as5011.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input/as5011.h b/include/linux/input/as5011.h index 5fba52a56cd61..5705d5de3aeae 100644 --- a/include/linux/input/as5011.h +++ b/include/linux/input/as5011.h @@ -7,7 +7,6 @@ */ struct as5011_platform_data { - unsigned int button_gpio; unsigned int axis_irq; /* irq number */ unsigned long axis_irqflags; char xp, xn; /* threshold for x axis */ -- cgit v1.2.3 From 6ab3d50b106c9aea123a80551a6c9deace83b914 Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Tue, 7 Nov 2023 16:14:49 +0800 Subject: bus: mhi: host: Add a separate timeout parameter for waiting ready Some devices (e.g. SDX75) take longer than expected (default, 8 seconds) to become ready after reboot. Hence add an optional ready timeout parameter and pass the appropriate timeout value to mhi_poll_reg_field() to wait long enough for the device to become ready as part of the power-up sequence. Signed-off-by: Qiang Yu Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/1699344890-87076-2-git-send-email-quic_qianyu@quicinc.com Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 039943ec4d4e7..d0f9b522f328b 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -266,6 +266,7 @@ struct mhi_event_config { * struct mhi_controller_config - Root MHI controller configuration * @max_channels: Maximum number of channels supported * @timeout_ms: Timeout value for operations. 0 means use default + * @ready_timeout_ms: Timeout value for waiting device to be ready (optional) * @buf_len: Size of automatically allocated buffers.
0 means use default * @num_channels: Number of channels defined in @ch_cfg * @ch_cfg: Array of defined channels @@ -277,6 +278,7 @@ struct mhi_event_config { struct mhi_controller_config { u32 max_channels; u32 timeout_ms; + u32 ready_timeout_ms; u32 buf_len; u32 num_channels; const struct mhi_channel_config *ch_cfg; @@ -330,6 +332,7 @@ struct mhi_controller_config { * @pm_mutex: Mutex for suspend/resume operation * @pm_lock: Lock for protecting MHI power management state * @timeout_ms: Timeout in ms for state transitions + * @ready_timeout_ms: Timeout in ms for waiting device to be ready (optional) * @pm_state: MHI power management state * @db_access: DB access states * @ee: MHI device execution environment @@ -419,6 +422,7 @@ struct mhi_controller { struct mutex pm_mutex; rwlock_t pm_lock; u32 timeout_ms; + u32 ready_timeout_ms; u32 pm_state; u32 db_access; enum mhi_ee_type ee; -- cgit v1.2.3 From fb6e30a72539ce28c1323aef4190d35aac106f6f Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:14 -0700 Subject: net: ethtool: pass a pointer to parameters to get/set_rxfh ethtool ops The get/set_rxfh ethtool ops currently takes the rxfh (RSS) parameters as direct function arguments. This will force us to change the API (and all drivers' functions) every time some new parameters are added. This is part 1/2 of the fix, as suggested in [1]: - First simplify the code by always providing a pointer to all params (indir, key and func); the fact that some of them may be NULL seems like a weird historic thing or a premature optimization. It will simplify the drivers if all pointers are always present. - Then make the functions take a dev pointer, and a pointer to a single struct wrapping all arguments. The set_* should also take an extack. 
Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1] Suggested-by: Jakub Kicinski Suggested-by: Jacob Keller Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-2-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 67b30940234be..3ab2b6a90419c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -596,6 +596,28 @@ struct ethtool_mm_stats { u64 MACMergeHoldCount; }; +/** + * struct ethtool_rxfh_param - RXFH (RSS) parameters + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @indir_size: On SET, the array size of the user buffer for the + * indirection table, which may be zero, or + * %ETH_RXFH_INDIR_NO_CHANGE. On GET (read from the driver), + * the array size of the hardware indirection table. + * @indir: The indirection table of size @indir_size entries. + * @key_size: On SET, the array size of the user buffer for the hash key, + * which may be zero. On GET (read from the driver), the size of the + * hardware hash key. + * @key: The hash key of size @key_size bytes. 
+ */ +struct ethtool_rxfh_param { + u8 hfunc; + u32 indir_size; + u32 *indir; + u32 key_size; + u8 *key; +}; + /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes @@ -846,14 +868,14 @@ struct ethtool_ops { int (*reset)(struct net_device *, u32 *); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); - int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key, - u8 *hfunc); - int (*set_rxfh)(struct net_device *, const u32 *indir, - const u8 *key, const u8 hfunc); - int (*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key, - u8 *hfunc, u32 rss_context); - int (*set_rxfh_context)(struct net_device *, const u32 *indir, - const u8 *key, const u8 hfunc, + int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); + int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, + struct netlink_ext_ack *extack); + int (*get_rxfh_context)(struct net_device *, + struct ethtool_rxfh_param *, + u32 rss_context); + int (*set_rxfh_context)(struct net_device *, + struct ethtool_rxfh_param *, u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); -- cgit v1.2.3 From dcd8dbf9e734eb334113ea43186c1c26e9f497bb Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:15 -0700 Subject: net: ethtool: get rid of get/set_rxfh_context functions Add the RSS context parameters to struct ethtool_rxfh_param and use the get/set_rxfh to handle the RSS contexts as well. This is part 2/2 of the fix suggested in [1]: - Add a rss_context member to the argument struct and a capability like cap_link_lanes_supported to indicate whether driver supports rss contexts, then you can remove *et_rxfh_context functions, and instead call *et_rxfh() with a non-zero rss_context. 
Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1] CC: Jesse Brandeburg CC: Tony Nguyen CC: Marcin Wojtas CC: Russell King CC: Sunil Goutham CC: Geetha sowjanya CC: Subbaraya Sundeep CC: hariprasad CC: Saeed Mahameed CC: Leon Romanovsky CC: Edward Cree CC: Martin Habets Suggested-by: Jakub Kicinski Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-3-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3ab2b6a90419c..66fe254c3e516 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -609,6 +609,12 @@ struct ethtool_mm_stats { * which may be zero. On GET (read from the driver), the size of the * hardware hash key. * @key: The hash key of size @key_size bytes. + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. On SET, %ETH_RXFH_CONTEXT_ALLOC is used + * to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @rss_delete: Set to non-ZERO to remove the @rss_context context. */ struct ethtool_rxfh_param { u8 hfunc; @@ -616,12 +622,16 @@ struct ethtool_rxfh_param { u32 *indir; u32 key_size; u8 *key; + u32 rss_context; + u8 rss_delete; }; /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. + * @cap_rss_ctx_supported: indicates if the driver supports RSS + * contexts. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -718,15 +728,6 @@ struct ethtool_rxfh_param { * will remain unchanged. 
* Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. - * @get_rxfh_context: Get the contents of the RX flow hash indirection table, - * hash key, and/or hash function assiciated to the given rss context. - * Returns a negative error code or zero. - * @set_rxfh_context: Create, remove and configure RSS contexts. Allows setting - * the contents of the RX flow hash indirection table, hash key, and/or - * hash function associated to the given context. Arguments which are set - * to %NULL or zero will remain unchanged. - * Returns a negative error code or zero. An error code must be returned - * if at least one unsupported change was requested. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. @@ -809,6 +810,7 @@ struct ethtool_rxfh_param { */ struct ethtool_ops { u32 cap_link_lanes_supported:1; + u32 cap_rss_ctx_supported:1; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); @@ -871,12 +873,6 @@ struct ethtool_ops { int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, struct netlink_ext_ack *extack); - int (*get_rxfh_context)(struct net_device *, - struct ethtool_rxfh_param *, - u32 rss_context); - int (*set_rxfh_context)(struct net_device *, - struct ethtool_rxfh_param *, - u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); -- cgit v1.2.3 From 13e59344fb9d3c9d3acd138ae320b5b67b658694 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:16 -0700 Subject: net: ethtool: add support for symmetric-xor RSS hash Symmetric RSS hash functions are beneficial in applications that 
monitor both Tx and Rx packets of the same flow (IDS, software firewalls, ..etc). Getting all traffic of the same flow on the same RX queue results in higher CPU cache efficiency. A NIC that supports "symmetric-xor" can achieve this RSS hash symmetry by XORing the source and destination fields and pass the values to the RSS hash algorithm. The user may request RSS hash symmetry for a specific algorithm, via: # ethtool -X eth0 hfunc symmetric-xor or turn symmetry off (asymmetric) by: # ethtool -X eth0 hfunc The specific fields for each flow type should then be specified as usual via: # ethtool -N|-U eth0 rx-flow-hash s|d|f|n Reviewed-by: Wojciech Drewek Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-4-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 66fe254c3e516..cfcd952a1d4f1 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -615,6 +615,8 @@ struct ethtool_mm_stats { * to allocate a new RSS context; on return this field will * contain the ID of the newly allocated context. * @rss_delete: Set to non-ZERO to remove the @rss_context context. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. */ struct ethtool_rxfh_param { u8 hfunc; @@ -624,6 +626,7 @@ struct ethtool_rxfh_param { u8 *key; u32 rss_context; u8 rss_delete; + u8 input_xfrm; }; /** @@ -632,6 +635,8 @@ struct ethtool_rxfh_param { * parameter. * @cap_rss_ctx_supported: indicates if the driver supports RSS * contexts. + * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor + * RSS. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. 
Modern drivers no @@ -811,6 +816,7 @@ struct ethtool_rxfh_param { struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; + u32 cap_rss_sym_xor_supported:1; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); -- cgit v1.2.3 From dc6e44c9d6d68e8aa5de78d15f43f93145719b72 Mon Sep 17 00:00:00 2001 From: Qi Zhang Date: Tue, 12 Dec 2023 17:33:18 -0700 Subject: ice: refactor RSS configuration Refactor the driver to use a communication data structure for RSS config. To do so we introduce the new ice_rss_hash_cfg struct, and then pass it as an argument to several functions. Also introduce enum ice_rss_cfg_hdr_type to specify a more granular and flexible RSS configuration: ICE_RSS_OUTER_HEADERS - take outer layer as RSS input set ICE_RSS_INNER_HEADERS - take inner layer as RSS input set ICE_RSS_INNER_HEADERS_W_OUTER_IPV4 - take inner layer as RSS input set for packet with outer IPV4 ICE_RSS_INNER_HEADERS_W_OUTER_IPV6 - take inner layer as RSS input set for packet with outer IPV6 ICE_RSS_ANY_HEADERS - try with outer first then inner (same as the behaviour without this change) Finally, move the virtchnl_rss_algorithm enum to be with the other RSS related structures in the virtchnl.h file. There should be no functional change due to this patch. 
Reviewed-by: Wojciech Drewek Signed-off-by: Qi Zhang Co-developed-by: Jesse Brandeburg Signed-off-by: Jesse Brandeburg Co-developed-by: Ahmed Zaki Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-6-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/avf/virtchnl.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 6b3acf15be5c2..b0e060cc79ac1 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -911,6 +911,14 @@ struct virtchnl_rss_hena { VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena); +/* Type of RSS algorithm */ +enum virtchnl_rss_algorithm { + VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, + VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, + VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, +}; + /* VIRTCHNL_OP_ENABLE_CHANNELS * VIRTCHNL_OP_DISABLE_CHANNELS * VF sends these messages to enable or disable channels based on @@ -1095,14 +1103,6 @@ enum virtchnl_vfr_states { VIRTCHNL_VFR_VFACTIVE, }; -/* Type of RSS algorithm */ -enum virtchnl_rss_algorithm { - VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, - VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, - VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, - VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, -}; - #define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 #define PROTO_HDR_SHIFT 5 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT) -- cgit v1.2.3 From 4a3de3fb0eb6897488dd510006abd9673f1fb34c Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:21 -0700 Subject: iavf: enable symmetric-xor RSS for Toeplitz hash function Allow the user to set the symmetric Toeplitz hash function via: # ethtool -X eth0 hfunc toeplitz symmetric-xor The driver will reject any new RSS configuration if a field other than (IP src/dst and L4 src/dst ports) is requested for hashing. 
The symmetric RSS will not be supported on PFs not advertising the ADV RSS Offload flag (ADV_RSS_SUPPORT()), for example the E700 series (i40e). Reviewed-by: Madhu Chittim Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-9-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/avf/virtchnl.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index b0e060cc79ac1..a44d9dc7e3eb6 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -118,6 +118,7 @@ enum virtchnl_ops { VIRTCHNL_OP_GET_STATS = 15, VIRTCHNL_OP_RSVD = 16, VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */ + VIRTCHNL_OP_CONFIG_RSS_HFUNC = 18, /* opcode 19 is reserved */ VIRTCHNL_OP_IWARP = 20, /* advanced opcode */ VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP, @@ -919,6 +920,21 @@ enum virtchnl_rss_algorithm { VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, }; +/* VIRTCHNL_OP_CONFIG_RSS_HFUNC + * VF sends this message to configure the RSS hash function. Only supported + * if both PF and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during + * configuration negotiation. + * The hash function is initialized to VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC + * by the PF. 
+ */ +struct virtchnl_rss_hfunc { + u16 vsi_id; + u16 rss_algorithm; /* enum virtchnl_rss_algorithm */ + u32 reserved; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hfunc); + /* VIRTCHNL_OP_ENABLE_CHANNELS * VIRTCHNL_OP_DISABLE_CHANNELS * VF sends these messages to enable or disable channels based on @@ -1542,6 +1558,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, vrl->lut_entries); } break; + case VIRTCHNL_OP_CONFIG_RSS_HFUNC: + valid_len = sizeof(struct virtchnl_rss_hfunc); + break; case VIRTCHNL_OP_GET_RSS_HENA_CAPS: break; case VIRTCHNL_OP_SET_RSS_HENA: -- cgit v1.2.3 From 62210a26cd4f8ad52683a71c0226dfe85de1144d Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 18 Oct 2023 17:58:12 +0530 Subject: bus: mhi: ep: Use slab allocator where applicable Use slab allocator for allocating the memory for objects used frequently and are of fixed size. This reduces the overhead associated with kmalloc(). Suggested-by: Alex Elder Link: https://lore.kernel.org/r/20231018122812.47261-1-manivannan.sadhasivam@linaro.org Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi_ep.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index f198a8ac7ee72..ce85d42b685d6 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -128,6 +128,9 @@ struct mhi_ep_cntrl { struct work_struct reset_work; struct work_struct cmd_ring_work; struct work_struct ch_ring_work; + struct kmem_cache *ring_item_cache; + struct kmem_cache *ev_ring_el_cache; + struct kmem_cache *tre_buf_cache; void (*raise_irq)(struct mhi_ep_cntrl *mhi_cntrl, u32 vector); int (*alloc_map)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t *phys_ptr, -- cgit v1.2.3 From b08ded2ef2e98768d5ee5f71da8fe768b1f7774b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 17 Aug 2023 23:24:52 +0530 Subject: bus: mhi: ep: Pass mhi_ep_buf_info struct to read/write APIs In the
preparation of DMA async support, let's pass the parameters to read_from_host() and write_to_host() APIs using mhi_ep_buf_info structure. No functional change. Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi_ep.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index ce85d42b685d6..96f3a133540db 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -49,6 +49,18 @@ struct mhi_ep_db_info { u32 status; }; +/** + * struct mhi_ep_buf_info - MHI Endpoint transfer buffer info + * @dev_addr: Address of the buffer in endpoint + * @host_addr: Address of the bufffer in host + * @size: Size of the buffer + */ +struct mhi_ep_buf_info { + void *dev_addr; + u64 host_addr; + size_t size; +}; + /** * struct mhi_ep_cntrl - MHI Endpoint controller structure * @cntrl_dev: Pointer to the struct device of physical bus acting as the MHI @@ -137,8 +149,8 @@ struct mhi_ep_cntrl { void __iomem **virt, size_t size); void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys, void __iomem *virt, size_t size); - int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, u64 from, void *to, size_t size); - int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, void *from, u64 to, size_t size); + int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); + int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); enum mhi_state mhi_state; -- cgit v1.2.3 From 927105244f8bc48e6841826a5644c6a961e03b5d Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 27 Nov 2023 13:57:37 +0530 Subject: bus: mhi: ep: Rename read_from_host() and write_to_host() APIs In the preparation for adding async API support, let's rename the existing APIs to read_sync() and write_sync() to make it explicit that these APIs are used for synchronous read/write. 
Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi_ep.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 96f3a133540db..b96b543bf2f65 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -94,8 +94,8 @@ struct mhi_ep_buf_info { * @raise_irq: CB function for raising IRQ to the host * @alloc_map: CB function for allocating memory in endpoint for storing host context and mapping it * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context - * @read_from_host: CB function for reading from host memory from endpoint - * @write_to_host: CB function for writing to host memory from endpoint + * @read_sync: CB function for reading from host memory synchronously + * @write_sync: CB function for writing to host memory synchronously * @mhi_state: MHI Endpoint state * @max_chan: Maximum channels supported by the endpoint controller * @mru: MRU (Maximum Receive Unit) value of the endpoint controller @@ -149,8 +149,8 @@ struct mhi_ep_cntrl { void __iomem **virt, size_t size); void (*unmap_free)(struct mhi_ep_cntrl *mhi_cntrl, u64 pci_addr, phys_addr_t phys, void __iomem *virt, size_t size); - int (*read_from_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); - int (*write_to_host)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); + int (*read_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); + int (*write_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); enum mhi_state mhi_state; -- cgit v1.2.3 From 8b786ed8fb089e347af21d13ba5677325fcd4cd8 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Mon, 27 Nov 2023 15:35:50 +0530 Subject: bus: mhi: ep: Introduce async read/write callbacks These callbacks can be implemented by the controller drivers to perform async read/write operation that increases the throughput. 
For aiding the async operation, a completion callback is also introduced. Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi_ep.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index b96b543bf2f65..14c6e8d3f5736 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -54,11 +54,16 @@ struct mhi_ep_db_info { * @dev_addr: Address of the buffer in endpoint * @host_addr: Address of the bufffer in host * @size: Size of the buffer + * @cb: Callback to be executed by controller drivers after transfer completion (async) + * @cb_buf: Opaque buffer to be passed to the callback */ struct mhi_ep_buf_info { void *dev_addr; u64 host_addr; size_t size; + + void (*cb)(struct mhi_ep_buf_info *buf_info); + void *cb_buf; }; /** @@ -96,6 +101,8 @@ struct mhi_ep_buf_info { * @unmap_free: CB function to unmap and free the allocated memory in endpoint for storing host context * @read_sync: CB function for reading from host memory synchronously * @write_sync: CB function for writing to host memory synchronously + * @read_async: CB function for reading from host memory asynchronously + * @write_async: CB function for writing to host memory asynchronously * @mhi_state: MHI Endpoint state * @max_chan: Maximum channels supported by the endpoint controller * @mru: MRU (Maximum Receive Unit) value of the endpoint controller @@ -151,6 +158,8 @@ struct mhi_ep_cntrl { void __iomem *virt, size_t size); int (*read_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); int (*write_sync)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); + int (*read_async)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); + int (*write_async)(struct mhi_ep_cntrl *mhi_cntrl, struct mhi_ep_buf_info *buf_info); enum mhi_state mhi_state; -- cgit v1.2.3 From b4c2bea8ceaa50cd42a8f73667389d801a3ecf2d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 25 Oct 
2023 16:02:03 +0200 Subject: add listmount(2) syscall Add a way to query the children of a particular mount. This is a more flexible way to iterate the mount tree than having to parse /proc/self/mountinfo. Lookup the mount by the new 64bit mount ID. If a mount needs to be queried based on path, then statx(2) can be used to first query the mount ID belonging to the path. Return an array of new (64bit) mount ID's. Without privileges only mounts are listed which are reachable from the task's root. Folded into this patch are several later improvements. Keeping them separate would make the history pointlessly confusing: * Recursive listing of mounts is the default now (cf. [1]). * Remove explicit LISTMOUNT_UNREACHABLE flag (cf. [1]) and fail if mount is unreachable from current root. This also makes permission checking consistent with statmount() (cf. [3]). * Start listing mounts in unique mount ID order (cf. [2]) to allow continuing listmount() from a midpoint. * Allow to continue listmount(). The @request_mask parameter is renamed to @param to be usable by both statmount() and listmount(). If @param is set to a mount id then listmount() will continue listing mounts from that id on. This allows listing mounts in multiple listmount invocations without having to resize the buffer. If @param is zero then the listing starts from the beginning (cf. [4]). * Don't return EOVERFLOW, instead return the buffer size which allows to detect a full buffer as well (cf. [4]).
Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20231025140205.3586473-6-mszeredi@redhat.com Reviewed-by: Ian Kent Link: https://lore.kernel.org/r/20231128160337.29094-2-mszeredi@redhat.com [1] (folded) Link: https://lore.kernel.org/r/20231128160337.29094-3-mszeredi@redhat.com [2] (folded) Link: https://lore.kernel.org/r/20231128160337.29094-4-mszeredi@redhat.com [3] (folded) Link: https://lore.kernel.org/r/20231128160337.29094-5-mszeredi@redhat.com [4] (folded) [Christian Brauner : various smaller fixes] Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 530ca9adf5f18..2d6d3e76e3f75 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -412,6 +412,9 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, asmlinkage long sys_statmount(const struct mnt_id_req __user *req, struct statmount __user *buf, size_t bufsize, unsigned int flags); +asmlinkage long sys_listmount(const struct mnt_id_req __user *req, + u64 __user *buf, size_t bufsize, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); #if BITS_PER_LONG == 32 -- cgit v1.2.3 From bf873a800ac3234eba991603a450eaa517d27022 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:35:11 -0800 Subject: net: skbuff: fix spelling errors Correct spelling as reported by codespell. 
Signed-off-by: Randy Dunlap Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231213043511.10357-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b370eb8d70f7f..7ce38874dbd1f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1069,7 +1069,7 @@ struct sk_buff { refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS - /* only useable after checking ->active_extensions != 0 */ + /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; @@ -3311,7 +3311,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. - * 1. This is for device Rx, therefor a cold page is preferred. + * 1. This is for device Rx, therefore a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page @@ -4247,7 +4247,7 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). */ + /* Using more efficient variant than plain call to memcmp(). */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; -- cgit v1.2.3 From ee08acb58fe47fc3bc2c137965985cdb1df40b35 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 2 Nov 2023 20:33:18 +0530 Subject: bus: mhi: ep: Add support for async DMA write operation In order to optimize the data transfer, let's use the async DMA operation for writing (queuing) data to the host. 
In the async path, the completion event for the transfer ring will only be sent to the host when the controller driver notifies the MHI stack of the actual transfer completion using the callback (mhi_ep_skb_completion) supplied in "struct mhi_ep_buf_info". Also to accommodate the async operation, the transfer ring read offset (ring->rd_offset) is cached in the "struct mhi_ep_chan" and updated locally to let the stack queue further ring items to the controller driver. But the actual read offset of the transfer ring will only be updated in the completion callback. Signed-off-by: Manivannan Sadhasivam --- include/linux/mhi_ep.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mhi_ep.h b/include/linux/mhi_ep.h index 14c6e8d3f5736..11bf3212f7822 100644 --- a/include/linux/mhi_ep.h +++ b/include/linux/mhi_ep.h @@ -51,16 +51,20 @@ struct mhi_ep_db_info { /** * struct mhi_ep_buf_info - MHI Endpoint transfer buffer info + * @mhi_dev: MHI device associated with this buffer * @dev_addr: Address of the buffer in endpoint * @host_addr: Address of the bufffer in host * @size: Size of the buffer + * @code: Transfer completion code * @cb: Callback to be executed by controller drivers after transfer completion (async) * @cb_buf: Opaque buffer to be passed to the callback */ struct mhi_ep_buf_info { + struct mhi_ep_device *mhi_dev; void *dev_addr; u64 host_addr; size_t size; + int code; void (*cb)(struct mhi_ep_buf_info *buf_info); void *cb_buf; -- cgit v1.2.3 From 0fe1798968115488c0c02f4633032a015b1faf97 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Thu, 14 Dec 2023 15:52:29 +0300 Subject: virtio/vsock: send credit update during setting SO_RCVLOWAT Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than SO_RCVLOWAT, so kick sender to send more data.
Otherwise a mutual hang-up of tx/rx is possible: the sender waits for free space and the receiver is waiting for data in 'poll()'. Rename 'set_rcvlowat' callback to 'notify_set_rcvlowat' and set 'sk->sk_rcvlowat' only in one place (i.e. 'vsock_set_rcvlowat'), so the transport doesn't need to do it. Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64da..c82089dee0c83 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From 4ad4c1f394b84f9941a10aa8aaf11102478a390b Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 24 Nov 2023 18:10:03 +0000 Subject: dma-mapping: don't store redundant offsets A bus_dma_region necessarily stores both CPU and DMA base addresses for a range, so there's no need to also store the difference between them.
Signed-off-by: Robin Murphy Acked-by: Rob Herring Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 18aade195884d..3eb3589ff43e9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -21,7 +21,6 @@ struct bus_dma_region { phys_addr_t cpu_start; dma_addr_t dma_start; u64 size; - u64 offset; }; static inline dma_addr_t translate_phys_to_dma(struct device *dev, @@ -29,9 +28,12 @@ static inline dma_addr_t translate_phys_to_dma(struct device *dev, { const struct bus_dma_region *m; - for (m = dev->dma_range_map; m->size; m++) - if (paddr >= m->cpu_start && paddr - m->cpu_start < m->size) - return (dma_addr_t)paddr - m->offset; + for (m = dev->dma_range_map; m->size; m++) { + u64 offset = paddr - m->cpu_start; + + if (paddr >= m->cpu_start && offset < m->size) + return m->dma_start + offset; + } /* make sure dma_capable fails when no translation is available */ return DMA_MAPPING_ERROR; @@ -42,9 +44,12 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev, { const struct bus_dma_region *m; - for (m = dev->dma_range_map; m->size; m++) - if (dma_addr >= m->dma_start && dma_addr - m->dma_start < m->size) - return (phys_addr_t)dma_addr + m->offset; + for (m = dev->dma_range_map; m->size; m++) { + u64 offset = dma_addr - m->dma_start; + + if (dma_addr >= m->dma_start && offset < m->size) + return m->cpu_start + offset; + } return (phys_addr_t)-1; } -- cgit v1.2.3 From 134c6eaa6087d78c0e289931ca15ae7a5007670d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 13 Dec 2023 15:02:35 -0800 Subject: driver core: Add a guard() definition for the device_lock() At present there are ~200 usages of device_lock() in the kernel. Some of those usages lead to "goto unlock;" patterns which have proven to be error prone. 
Define a "device" guard() definition to allow for those to be cleaned up and prevent new ones from appearing. Link: http://lore.kernel.org/r/657897453dda8_269bd29492@dwillia2-mobl3.amr.corp.intel.com.notmuch Link: http://lore.kernel.org/r/6577b0c2a02df_a04c5294bb@dwillia2-xfh.jf.intel.com.notmuch Cc: Vishal Verma Cc: Ira Weiny Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Andrew Morton Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Reviewed-by: Vishal Verma Link: https://lore.kernel.org/r/170250854466.1522182.17555361077409628655.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index d7a72a8749ea0..6c83294395ac0 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1007,6 +1007,8 @@ static inline void device_unlock(struct device *dev) mutex_unlock(&dev->mutex); } +DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) + static inline void device_lock_assert(struct device *dev) { lockdep_assert_held(&dev->mutex); -- cgit v1.2.3 From 7f38b70042fcaa49219045bd1a9a2836e27a58ac Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Dec 2023 11:15:27 +0000 Subject: of: device: Export of_device_make_bus_id() This helper is really handy to create unique device names based on their device tree path, we may need it outside of the OF core (in the NVMEM subsystem) so let's export it. As this helper has nothing platform specific, let's move it to of/device.c instead of of/platform.c so we can add its prototype to of_device.h.
Signed-off-by: Miquel Raynal Acked-by: Rob Herring Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231215111536.316972-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/of_device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 2c7a3d4bc775b..a72661e47faa5 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -40,6 +40,9 @@ static inline int of_dma_configure(struct device *dev, { return of_dma_configure_id(dev, np, force_dma, NULL); } + +void of_device_make_bus_id(struct device *dev); + #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -82,6 +85,9 @@ static inline int of_dma_configure(struct device *dev, { return 0; } + +static inline void of_device_make_bus_id(struct device *dev) {} + #endif /* CONFIG_OF */ #endif /* _LINUX_OF_DEVICE_H */ -- cgit v1.2.3 From 4a1a40233b4a9fc159a5c7a27dc34c5c7bc5be55 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Dec 2023 11:15:28 +0000 Subject: nvmem: Move of_nvmem_layout_get_container() in another header nvmem-consumer.h is included by consumer devices, extracting data from NVMEM devices whereas nvmem-provider.h is included by devices providing NVMEM content. The only users of of_nvmem_layout_get_container() outside of the core are layout drivers, so better move its prototype to nvmem-provider.h. While we do so, we also move the kdoc associated with the function to the header rather than the .c file. 
Signed-off-by: Miquel Raynal Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231215111536.316972-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 7 ------- include/linux/nvmem-provider.h | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index 6ec4b9743e25d..2d306fa13b1a8 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -247,7 +247,6 @@ struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, const char *id); struct nvmem_device *of_nvmem_device_get(struct device_node *np, const char *name); -struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem); #else static inline struct nvmem_cell *of_nvmem_cell_get(struct device_node *np, const char *id) @@ -260,12 +259,6 @@ static inline struct nvmem_device *of_nvmem_device_get(struct device_node *np, { return ERR_PTR(-EOPNOTSUPP); } - -static inline struct device_node * -of_nvmem_layout_get_container(struct nvmem_device *nvmem) -{ - return NULL; -} #endif /* CONFIG_NVMEM && CONFIG_OF */ #endif /* ifndef _LINUX_NVMEM_CONSUMER_H */ diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index e3930835235ba..e5de21516387e 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -244,6 +244,27 @@ nvmem_layout_get_match_data(struct nvmem_device *nvmem, #endif /* CONFIG_NVMEM */ +#if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF) + +/** + * of_nvmem_layout_get_container() - Get OF node of layout container + * + * @nvmem: nvmem device + * + * Return: a node pointer with refcount incremented or NULL if no + * container exists. Use of_node_put() on it when done. 
+ */ +struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem); + +#else /* CONFIG_NVMEM && CONFIG_OF */ + +static inline struct device_node *of_nvmem_layout_get_container(struct nvmem_device *nvmem) +{ + return NULL; +} + +#endif /* CONFIG_NVMEM && CONFIG_OF */ + #define module_nvmem_layout_driver(__layout_driver) \ module_driver(__layout_driver, nvmem_layout_register, \ nvmem_layout_unregister) -- cgit v1.2.3 From 1b7c298a4ecbc28cc6ee94005734bff55eb83d22 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Dec 2023 11:15:30 +0000 Subject: nvmem: Simplify the ->add_cells() hook The layout entry is not used and will anyway be made useless by the new layout bus infrastructure coming next, so drop it. While at it, clarify the kdoc entry. Signed-off-by: Miquel Raynal Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231215111536.316972-5-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index e5de21516387e..3939991b3c5f9 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -156,9 +156,8 @@ struct nvmem_cell_table { * * @name: Layout name. * @of_match_table: Open firmware match table. - * @add_cells: Will be called if a nvmem device is found which - * has this layout. The function will add layout - * specific cells with nvmem_add_one_cell(). + * @add_cells: Called to populate the layout using + * nvmem_add_one_cell(). * @fixup_cell_info: Will be called before a cell is added. Can be * used to modify the nvmem_cell_info. * @owner: Pointer to struct module. 
@@ -172,8 +171,7 @@ struct nvmem_cell_table { struct nvmem_layout { const char *name; const struct of_device_id *of_match_table; - int (*add_cells)(struct device *dev, struct nvmem_device *nvmem, - struct nvmem_layout *layout); + int (*add_cells)(struct device *dev, struct nvmem_device *nvmem); void (*fixup_cell_info)(struct nvmem_device *nvmem, struct nvmem_layout *layout, struct nvmem_cell_info *cell); -- cgit v1.2.3 From 1172460e716784ac7e1049a537bdca8edbf97360 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Dec 2023 11:15:31 +0000 Subject: nvmem: Move and rename ->fixup_cell_info() This hook is meant to be used by any provider and instantiating a layout just for this is useless. Let's instead move this hook to the nvmem device and add it to the config structure to be easily shared by the providers. While at moving this hook, rename it ->fixup_dt_cell_info() to clarify its main intended purpose. Signed-off-by: Miquel Raynal Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231215111536.316972-6-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 3939991b3c5f9..36415a602d9eb 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -83,6 +83,8 @@ struct nvmem_cell_info { * @cells: Optional array of pre-defined NVMEM cells. * @ncells: Number of elements in cells. * @add_legacy_fixed_of_cells: Read fixed NVMEM cells from old OF syntax. + * @fixup_dt_cell_info: Will be called before a cell is added. Can be + * used to modify the nvmem_cell_info. * @keepout: Optional array of keepout ranges (sorted ascending by start). * @nkeepout: Number of elements in the keepout array. 
* @type: Type of the nvmem storage @@ -113,6 +115,8 @@ struct nvmem_config { const struct nvmem_cell_info *cells; int ncells; bool add_legacy_fixed_of_cells; + void (*fixup_dt_cell_info)(struct nvmem_device *nvmem, + struct nvmem_cell_info *cell); const struct nvmem_keepout *keepout; unsigned int nkeepout; enum nvmem_type type; @@ -158,8 +162,6 @@ struct nvmem_cell_table { * @of_match_table: Open firmware match table. * @add_cells: Called to populate the layout using * nvmem_add_one_cell(). - * @fixup_cell_info: Will be called before a cell is added. Can be - * used to modify the nvmem_cell_info. * @owner: Pointer to struct module. * @node: List node. * @@ -172,9 +174,6 @@ struct nvmem_layout { const char *name; const struct of_device_id *of_match_table; int (*add_cells)(struct device *dev, struct nvmem_device *nvmem); - void (*fixup_cell_info)(struct nvmem_device *nvmem, - struct nvmem_layout *layout, - struct nvmem_cell_info *cell); /* private */ struct module *owner; -- cgit v1.2.3 From fc29fd821d9ac2ae3d32a722fac39ce874efb883 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Dec 2023 11:15:32 +0000 Subject: nvmem: core: Rework layouts to become regular devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current layout support was initially written without modules support in mind. When the requirement for module support rose, the existing base was improved to adopt modularization support, but kind of a design flaw was introduced. With the existing implementation, when a storage device registers into NVMEM, the core tries to hook a layout (if any) and populates its cells immediately. This means, if the hardware description expects a layout to be hooked up, but no driver was provided for that, the storage medium will fail to probe and try later from scratch. Even if we consider that the hardware description shall be correct, we could still probe the storage device (especially if it contains the rootfs). 
One way to overcome this situation is to consider the layouts as devices, and leverage the native notifier mechanism. When a new NVMEM device is registered, we can populate its nvmem-layout child, if any, and wait for the matching to be done in order to get the cells (the waiting can be easily done with the NVMEM notifiers). If the layout driver is compiled as a module, it should automatically be loaded. This way, there is no strong order to enforce, any NVMEM device creation or NVMEM layout driver insertion will be observed as a new event which may lead to the creation of additional cells, without disturbing the probes with costly (and sometimes endless) deferrals. In order to achieve that goal we create a new bus for the nvmem-layouts with minimal logic to match nvmem-layout devices with nvmem-layout drivers. All this infrastructure code is created in the layouts.c file. Signed-off-by: Miquel Raynal Tested-by: Rafał Miłecki Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20231215111536.316972-7-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 36415a602d9eb..6fe65b35ea972 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -9,6 +9,7 @@ #ifndef _LINUX_NVMEM_PROVIDER_H #define _LINUX_NVMEM_PROVIDER_H +#include #include #include #include @@ -158,12 +159,11 @@ struct nvmem_cell_table { /** * struct nvmem_layout - NVMEM layout definitions * - * @name: Layout name. - * @of_match_table: Open firmware match table. - * @add_cells: Called to populate the layout using - * nvmem_add_one_cell(). - * @owner: Pointer to struct module. - * @node: List node. + * @dev: Device-model layout device. 
+ * @nvmem: The underlying NVMEM device + * @add_cells: Will be called if a nvmem device is found which + * has this layout. The function will add layout + * specific cells with nvmem_add_one_cell(). * * A nvmem device can hold a well defined structure which can just be * evaluated during runtime. For example a TLV list, or a list of "name=val" @@ -171,13 +171,15 @@ struct nvmem_cell_table { * cells. */ struct nvmem_layout { - const char *name; - const struct of_device_id *of_match_table; + struct device dev; + struct nvmem_device *nvmem; int (*add_cells)(struct device *dev, struct nvmem_device *nvmem); +}; - /* private */ - struct module *owner; - struct list_head node; +struct nvmem_layout_driver { + struct device_driver driver; + int (*probe)(struct nvmem_layout *layout); + void (*remove)(struct nvmem_layout *layout); }; #if IS_ENABLED(CONFIG_NVMEM) @@ -194,11 +196,15 @@ void nvmem_del_cell_table(struct nvmem_cell_table *table); int nvmem_add_one_cell(struct nvmem_device *nvmem, const struct nvmem_cell_info *info); -int __nvmem_layout_register(struct nvmem_layout *layout, struct module *owner); -#define nvmem_layout_register(layout) \ - __nvmem_layout_register(layout, THIS_MODULE) +int nvmem_layout_register(struct nvmem_layout *layout); void nvmem_layout_unregister(struct nvmem_layout *layout); +int nvmem_layout_driver_register(struct nvmem_layout_driver *drv); +void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv); +#define module_nvmem_layout_driver(__nvmem_layout_driver) \ + module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \ + nvmem_layout_driver_unregister) + const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem, struct nvmem_layout *layout); @@ -262,8 +268,4 @@ static inline struct device_node *of_nvmem_layout_get_container(struct nvmem_dev #endif /* CONFIG_NVMEM && CONFIG_OF */ -#define module_nvmem_layout_driver(__layout_driver) \ - module_driver(__layout_driver, nvmem_layout_register, \ - 
nvmem_layout_unregister) - #endif /* ifndef _LINUX_NVMEM_PROVIDER_H */ -- cgit v1.2.3 From 1f78c56007ba61b7b8c3f7dbb6787b6af116d3f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Thu, 7 Dec 2023 18:56:06 +0100 Subject: tty: serial: amba: Use linux/{bits,bitfield}.h macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver uses bit shifts and hexadecimal expressions to declare constants. Replace that with the BIT(), GENMASK() & FIELD_PREP_CONST() macros to clarify intent. include/linux/amba/serial.h gets included from arch/arm/include/debug/pl01x.S. Avoid includes and macro tricks for the four defines that are involved: UART01x_DR, UART01x_FR, UART01x_FR_TXFF and UART01x_FR_BUSY. Reviewed-by: Linus Walleij Reviewed-by: Ilpo Järvinen Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20231207-mbly-uart-v6-1-e384afa5e78c@bootlin.com Signed-off-by: Greg Kroah-Hartman --- include/linux/amba/serial.h | 251 +++++++++++++++++++++++--------------------- 1 file changed, 130 insertions(+), 121 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index 27003ec52114c..9120de05ead08 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -10,6 +10,11 @@ #ifndef ASM_ARM_HARDWARE_SERIAL_AMBA_H #define ASM_ARM_HARDWARE_SERIAL_AMBA_H +#ifndef __ASSEMBLY__ +#include +#include +#endif + #include /* ------------------------------------------------------------------------------- @@ -70,138 +75,142 @@ #define ZX_UART011_ICR 0x4c #define ZX_UART011_DMACR 0x50 -#define UART011_DR_OE (1 << 11) -#define UART011_DR_BE (1 << 10) -#define UART011_DR_PE (1 << 9) -#define UART011_DR_FE (1 << 8) - -#define UART01x_RSR_OE 0x08 -#define UART01x_RSR_BE 0x04 -#define UART01x_RSR_PE 0x02 -#define UART01x_RSR_FE 0x01 - -#define UART011_FR_RI 0x100 -#define UART011_FR_TXFE 0x080 -#define UART011_FR_RXFF 0x040 -#define UART01x_FR_TXFF 0x020 
-#define UART01x_FR_RXFE 0x010 -#define UART01x_FR_BUSY 0x008 -#define UART01x_FR_DCD 0x004 -#define UART01x_FR_DSR 0x002 -#define UART01x_FR_CTS 0x001 +#define UART011_DR_OE BIT(11) +#define UART011_DR_BE BIT(10) +#define UART011_DR_PE BIT(9) +#define UART011_DR_FE BIT(8) + +#define UART01x_RSR_OE BIT(3) +#define UART01x_RSR_BE BIT(2) +#define UART01x_RSR_PE BIT(1) +#define UART01x_RSR_FE BIT(0) + +#define UART011_FR_RI BIT(8) +#define UART011_FR_TXFE BIT(7) +#define UART011_FR_RXFF BIT(6) +#define UART01x_FR_TXFF (1 << 5) /* used in ASM */ +#define UART01x_FR_RXFE BIT(4) +#define UART01x_FR_BUSY (1 << 3) /* used in ASM */ +#define UART01x_FR_DCD BIT(2) +#define UART01x_FR_DSR BIT(1) +#define UART01x_FR_CTS BIT(0) #define UART01x_FR_TMSK (UART01x_FR_TXFF + UART01x_FR_BUSY) /* * Some bits of Flag Register on ZTE device have different position from * standard ones. */ -#define ZX_UART01x_FR_BUSY 0x100 -#define ZX_UART01x_FR_DSR 0x008 -#define ZX_UART01x_FR_CTS 0x002 -#define ZX_UART011_FR_RI 0x001 - -#define UART011_CR_CTSEN 0x8000 /* CTS hardware flow control */ -#define UART011_CR_RTSEN 0x4000 /* RTS hardware flow control */ -#define UART011_CR_OUT2 0x2000 /* OUT2 */ -#define UART011_CR_OUT1 0x1000 /* OUT1 */ -#define UART011_CR_RTS 0x0800 /* RTS */ -#define UART011_CR_DTR 0x0400 /* DTR */ -#define UART011_CR_RXE 0x0200 /* receive enable */ -#define UART011_CR_TXE 0x0100 /* transmit enable */ -#define UART011_CR_LBE 0x0080 /* loopback enable */ -#define UART010_CR_RTIE 0x0040 -#define UART010_CR_TIE 0x0020 -#define UART010_CR_RIE 0x0010 -#define UART010_CR_MSIE 0x0008 -#define ST_UART011_CR_OVSFACT 0x0008 /* Oversampling factor */ -#define UART01x_CR_IIRLP 0x0004 /* SIR low power mode */ -#define UART01x_CR_SIREN 0x0002 /* SIR enable */ -#define UART01x_CR_UARTEN 0x0001 /* UART enable */ - -#define UART011_LCRH_SPS 0x80 +#define ZX_UART01x_FR_BUSY BIT(8) +#define ZX_UART01x_FR_DSR BIT(3) +#define ZX_UART01x_FR_CTS BIT(1) +#define ZX_UART011_FR_RI BIT(0) + +#define 
UART011_CR_CTSEN BIT(15) /* CTS hardware flow control */ +#define UART011_CR_RTSEN BIT(14) /* RTS hardware flow control */ +#define UART011_CR_OUT2 BIT(13) /* OUT2 */ +#define UART011_CR_OUT1 BIT(12) /* OUT1 */ +#define UART011_CR_RTS BIT(11) /* RTS */ +#define UART011_CR_DTR BIT(10) /* DTR */ +#define UART011_CR_RXE BIT(9) /* receive enable */ +#define UART011_CR_TXE BIT(8) /* transmit enable */ +#define UART011_CR_LBE BIT(7) /* loopback enable */ +#define UART010_CR_RTIE BIT(6) +#define UART010_CR_TIE BIT(5) +#define UART010_CR_RIE BIT(4) +#define UART010_CR_MSIE BIT(3) +#define ST_UART011_CR_OVSFACT BIT(3) /* Oversampling factor */ +#define UART01x_CR_IIRLP BIT(2) /* SIR low power mode */ +#define UART01x_CR_SIREN BIT(1) /* SIR enable */ +#define UART01x_CR_UARTEN BIT(0) /* UART enable */ + +#define UART011_LCRH_SPS BIT(7) #define UART01x_LCRH_WLEN_8 0x60 #define UART01x_LCRH_WLEN_7 0x40 #define UART01x_LCRH_WLEN_6 0x20 #define UART01x_LCRH_WLEN_5 0x00 -#define UART01x_LCRH_FEN 0x10 -#define UART01x_LCRH_STP2 0x08 -#define UART01x_LCRH_EPS 0x04 -#define UART01x_LCRH_PEN 0x02 -#define UART01x_LCRH_BRK 0x01 - -#define ST_UART011_DMAWM_RX_1 (0 << 3) -#define ST_UART011_DMAWM_RX_2 (1 << 3) -#define ST_UART011_DMAWM_RX_4 (2 << 3) -#define ST_UART011_DMAWM_RX_8 (3 << 3) -#define ST_UART011_DMAWM_RX_16 (4 << 3) -#define ST_UART011_DMAWM_RX_32 (5 << 3) -#define ST_UART011_DMAWM_RX_48 (6 << 3) -#define ST_UART011_DMAWM_TX_1 0 -#define ST_UART011_DMAWM_TX_2 1 -#define ST_UART011_DMAWM_TX_4 2 -#define ST_UART011_DMAWM_TX_8 3 -#define ST_UART011_DMAWM_TX_16 4 -#define ST_UART011_DMAWM_TX_32 5 -#define ST_UART011_DMAWM_TX_48 6 - -#define UART010_IIR_RTIS 0x08 -#define UART010_IIR_TIS 0x04 -#define UART010_IIR_RIS 0x02 -#define UART010_IIR_MIS 0x01 - -#define UART011_IFLS_RX1_8 (0 << 3) -#define UART011_IFLS_RX2_8 (1 << 3) -#define UART011_IFLS_RX4_8 (2 << 3) -#define UART011_IFLS_RX6_8 (3 << 3) -#define UART011_IFLS_RX7_8 (4 << 3) -#define UART011_IFLS_TX1_8 (0 << 0) 
-#define UART011_IFLS_TX2_8 (1 << 0) -#define UART011_IFLS_TX4_8 (2 << 0) -#define UART011_IFLS_TX6_8 (3 << 0) -#define UART011_IFLS_TX7_8 (4 << 0) +#define UART01x_LCRH_FEN BIT(4) +#define UART01x_LCRH_STP2 BIT(3) +#define UART01x_LCRH_EPS BIT(2) +#define UART01x_LCRH_PEN BIT(1) +#define UART01x_LCRH_BRK BIT(0) + +#define ST_UART011_DMAWM_RX GENMASK(5, 3) +#define ST_UART011_DMAWM_RX_1 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 0) +#define ST_UART011_DMAWM_RX_2 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 1) +#define ST_UART011_DMAWM_RX_4 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 2) +#define ST_UART011_DMAWM_RX_8 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 3) +#define ST_UART011_DMAWM_RX_16 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 4) +#define ST_UART011_DMAWM_RX_32 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 5) +#define ST_UART011_DMAWM_RX_48 FIELD_PREP_CONST(ST_UART011_DMAWM_RX, 6) +#define ST_UART011_DMAWM_TX GENMASK(2, 0) +#define ST_UART011_DMAWM_TX_1 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 0) +#define ST_UART011_DMAWM_TX_2 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 1) +#define ST_UART011_DMAWM_TX_4 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 2) +#define ST_UART011_DMAWM_TX_8 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 3) +#define ST_UART011_DMAWM_TX_16 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 4) +#define ST_UART011_DMAWM_TX_32 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 5) +#define ST_UART011_DMAWM_TX_48 FIELD_PREP_CONST(ST_UART011_DMAWM_TX, 6) + +#define UART010_IIR_RTIS BIT(3) +#define UART010_IIR_TIS BIT(2) +#define UART010_IIR_RIS BIT(1) +#define UART010_IIR_MIS BIT(0) + +#define UART011_IFLS_RXIFLSEL GENMASK(5, 3) +#define UART011_IFLS_RX1_8 FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 0) +#define UART011_IFLS_RX2_8 FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 1) +#define UART011_IFLS_RX4_8 FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 2) +#define UART011_IFLS_RX6_8 FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 3) +#define UART011_IFLS_RX7_8 FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 4) +#define UART011_IFLS_TXIFLSEL GENMASK(2, 0) 
+#define UART011_IFLS_TX1_8 FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 0) +#define UART011_IFLS_TX2_8 FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 1) +#define UART011_IFLS_TX4_8 FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 2) +#define UART011_IFLS_TX6_8 FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 3) +#define UART011_IFLS_TX7_8 FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 4) /* special values for ST vendor with deeper fifo */ -#define UART011_IFLS_RX_HALF (5 << 3) -#define UART011_IFLS_TX_HALF (5 << 0) - -#define UART011_OEIM (1 << 10) /* overrun error interrupt mask */ -#define UART011_BEIM (1 << 9) /* break error interrupt mask */ -#define UART011_PEIM (1 << 8) /* parity error interrupt mask */ -#define UART011_FEIM (1 << 7) /* framing error interrupt mask */ -#define UART011_RTIM (1 << 6) /* receive timeout interrupt mask */ -#define UART011_TXIM (1 << 5) /* transmit interrupt mask */ -#define UART011_RXIM (1 << 4) /* receive interrupt mask */ -#define UART011_DSRMIM (1 << 3) /* DSR interrupt mask */ -#define UART011_DCDMIM (1 << 2) /* DCD interrupt mask */ -#define UART011_CTSMIM (1 << 1) /* CTS interrupt mask */ -#define UART011_RIMIM (1 << 0) /* RI interrupt mask */ - -#define UART011_OEIS (1 << 10) /* overrun error interrupt status */ -#define UART011_BEIS (1 << 9) /* break error interrupt status */ -#define UART011_PEIS (1 << 8) /* parity error interrupt status */ -#define UART011_FEIS (1 << 7) /* framing error interrupt status */ -#define UART011_RTIS (1 << 6) /* receive timeout interrupt status */ -#define UART011_TXIS (1 << 5) /* transmit interrupt status */ -#define UART011_RXIS (1 << 4) /* receive interrupt status */ -#define UART011_DSRMIS (1 << 3) /* DSR interrupt status */ -#define UART011_DCDMIS (1 << 2) /* DCD interrupt status */ -#define UART011_CTSMIS (1 << 1) /* CTS interrupt status */ -#define UART011_RIMIS (1 << 0) /* RI interrupt status */ - -#define UART011_OEIC (1 << 10) /* overrun error interrupt clear */ -#define UART011_BEIC (1 << 9) /* break error interrupt 
clear */ -#define UART011_PEIC (1 << 8) /* parity error interrupt clear */ -#define UART011_FEIC (1 << 7) /* framing error interrupt clear */ -#define UART011_RTIC (1 << 6) /* receive timeout interrupt clear */ -#define UART011_TXIC (1 << 5) /* transmit interrupt clear */ -#define UART011_RXIC (1 << 4) /* receive interrupt clear */ -#define UART011_DSRMIC (1 << 3) /* DSR interrupt clear */ -#define UART011_DCDMIC (1 << 2) /* DCD interrupt clear */ -#define UART011_CTSMIC (1 << 1) /* CTS interrupt clear */ -#define UART011_RIMIC (1 << 0) /* RI interrupt clear */ - -#define UART011_DMAONERR (1 << 2) /* disable dma on error */ -#define UART011_TXDMAE (1 << 1) /* enable transmit dma */ -#define UART011_RXDMAE (1 << 0) /* enable receive dma */ +#define UART011_IFLS_RX_HALF FIELD_PREP_CONST(UART011_IFLS_RXIFLSEL, 5) +#define UART011_IFLS_TX_HALF FIELD_PREP_CONST(UART011_IFLS_TXIFLSEL, 5) + +#define UART011_OEIM BIT(10) /* overrun error interrupt mask */ +#define UART011_BEIM BIT(9) /* break error interrupt mask */ +#define UART011_PEIM BIT(8) /* parity error interrupt mask */ +#define UART011_FEIM BIT(7) /* framing error interrupt mask */ +#define UART011_RTIM BIT(6) /* receive timeout interrupt mask */ +#define UART011_TXIM BIT(5) /* transmit interrupt mask */ +#define UART011_RXIM BIT(4) /* receive interrupt mask */ +#define UART011_DSRMIM BIT(3) /* DSR interrupt mask */ +#define UART011_DCDMIM BIT(2) /* DCD interrupt mask */ +#define UART011_CTSMIM BIT(1) /* CTS interrupt mask */ +#define UART011_RIMIM BIT(0) /* RI interrupt mask */ + +#define UART011_OEIS BIT(10) /* overrun error interrupt status */ +#define UART011_BEIS BIT(9) /* break error interrupt status */ +#define UART011_PEIS BIT(8) /* parity error interrupt status */ +#define UART011_FEIS BIT(7) /* framing error interrupt status */ +#define UART011_RTIS BIT(6) /* receive timeout interrupt status */ +#define UART011_TXIS BIT(5) /* transmit interrupt status */ +#define UART011_RXIS BIT(4) /* receive interrupt 
status */ +#define UART011_DSRMIS BIT(3) /* DSR interrupt status */ +#define UART011_DCDMIS BIT(2) /* DCD interrupt status */ +#define UART011_CTSMIS BIT(1) /* CTS interrupt status */ +#define UART011_RIMIS BIT(0) /* RI interrupt status */ + +#define UART011_OEIC BIT(10) /* overrun error interrupt clear */ +#define UART011_BEIC BIT(9) /* break error interrupt clear */ +#define UART011_PEIC BIT(8) /* parity error interrupt clear */ +#define UART011_FEIC BIT(7) /* framing error interrupt clear */ +#define UART011_RTIC BIT(6) /* receive timeout interrupt clear */ +#define UART011_TXIC BIT(5) /* transmit interrupt clear */ +#define UART011_RXIC BIT(4) /* receive interrupt clear */ +#define UART011_DSRMIC BIT(3) /* DSR interrupt clear */ +#define UART011_DCDMIC BIT(2) /* DCD interrupt clear */ +#define UART011_CTSMIC BIT(1) /* CTS interrupt clear */ +#define UART011_RIMIC BIT(0) /* RI interrupt clear */ + +#define UART011_DMAONERR BIT(2) /* disable dma on error */ +#define UART011_TXDMAE BIT(1) /* enable transmit dma */ +#define UART011_RXDMAE BIT(0) /* enable receive dma */ #define UART01x_RSR_ANY (UART01x_RSR_OE | UART01x_RSR_BE | UART01x_RSR_PE | UART01x_RSR_FE) #define UART01x_FR_MODEM_ANY (UART01x_FR_DCD | UART01x_FR_DSR | UART01x_FR_CTS) -- cgit v1.2.3 From 0c734c5ea76e333fbb8dd83b5bab46291b38096b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 14 Dec 2023 11:08:15 -0700 Subject: block: improve struct request_queue layout It's clearly been a while since someone looked at this, so I gave it a quick shot. There are few issues in here: - Random bundling of members that are mostly read-only and often written - Random holes that need not be there This moves the most frequently used bits into cacheline 1 and 2, with the 2nd one being more write intensive than the first one, which is basically read-only. Outside of making this work a bit more efficiently, it also reduces the size of struct request_queue for my test setup from 864 bytes (spanning 14 cachelines!) 
to 832 bytes and 13 cachelines. Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/d2b7b61c-4868-45c0-9060-4f9c73de9d7e@kernel.dk Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 89 ++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 17c0a7d0d319e..185ed3770e3a9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,59 +367,51 @@ struct blk_independent_access_ranges { }; struct request_queue { - struct request *last_merge; - struct elevator_queue *elevator; - - struct percpu_ref q_usage_counter; + /* + * The queue owner gets to use this for whatever they like. + * ll_rw_blk doesn't touch it. + */ + void *queuedata; - struct blk_queue_stats *stats; - struct rq_qos *rq_qos; - struct mutex rq_qos_mutex; + struct elevator_queue *elevator; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; + /* + * various queue flags, see QUEUE_* below + */ + unsigned long queue_flags; + + unsigned int rq_timeout; + unsigned int queue_depth; + refcount_t refs; + /* hw dispatch queues */ - struct xarray hctx_table; unsigned int nr_hw_queues; + struct xarray hctx_table; - /* - * The queue owner gets to use this for whatever they like. - * ll_rw_blk doesn't touch it. - */ - void *queuedata; - - /* - * various queue flags, see QUEUE_* below - */ - unsigned long queue_flags; - /* - * Number of contexts that have called blk_set_pm_only(). If this - * counter is above zero then only RQF_PM requests are processed. - */ - atomic_t pm_only; + struct percpu_ref q_usage_counter; - /* - * ida allocated id for this queue. Used to index queues from - * ioctx. 
- */ - int id; + struct request *last_merge; spinlock_t queue_lock; - struct gendisk *disk; + int quiesce_depth; - refcount_t refs; + struct gendisk *disk; /* * mq queue kobject */ struct kobject *mq_kobj; + struct queue_limits limits; + #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity integrity; #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -430,24 +422,40 @@ struct request_queue { #endif /* - * queue settings + * Number of contexts that have called blk_set_pm_only(). If this + * counter is above zero then only RQF_PM requests are processed. */ - unsigned long nr_requests; /* Max # of requests */ + atomic_t pm_only; + + struct blk_queue_stats *stats; + struct rq_qos *rq_qos; + struct mutex rq_qos_mutex; + + /* + * ida allocated id for this queue. Used to index queues from + * ioctx. + */ + int id; unsigned int dma_pad_mask; + /* + * queue settings + */ + unsigned long nr_requests; /* Max # of requests */ + #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; struct kobject *crypto_kobject; #endif - unsigned int rq_timeout; - struct timer_list timeout; struct work_struct timeout_work; atomic_t nr_active_requests_shared_tags; + unsigned int required_elevator_features; + struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; @@ -458,11 +466,12 @@ struct request_queue { struct mutex blkcg_mutex; #endif - struct queue_limits limits; + int node; - unsigned int required_elevator_features; + spinlock_t requeue_lock; + struct list_head requeue_list; + struct delayed_work requeue_work; - int node; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif @@ -472,10 +481,6 @@ struct request_queue { struct blk_flush_queue *fq; struct list_head flush_list; - struct list_head requeue_list; - spinlock_t requeue_lock; - struct delayed_work requeue_work; - struct mutex sysfs_lock; struct mutex sysfs_dir_lock; @@ -500,8 +505,6 @@ struct request_queue { */ struct mutex mq_freeze_lock; - int quiesce_depth; - struct blk_mq_tag_set 
*tag_set; struct list_head tag_set_list; -- cgit v1.2.3 From 826a5d8c9df9605fb4fdefa45432f95580241a1f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 25 Oct 2023 21:42:57 +0300 Subject: device property: Implement device_is_big_endian() Some users want to use the struct device pointer to see if the device is big endian in terms of Open Firmware specifications, i.e. if it has a "big-endian" property, or if the kernel was compiled for BE *and* the device has a "native-endian" property. Provide inline helper for the users. Signed-off-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20231025184259.250588-2-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/property.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 9f2585d705a86..55c2692ffa8ca 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -80,12 +80,38 @@ int fwnode_property_match_string(const struct fwnode_handle *fwnode, bool fwnode_device_is_available(const struct fwnode_handle *fwnode); +static inline bool fwnode_device_is_big_endian(const struct fwnode_handle *fwnode) +{ + if (fwnode_property_present(fwnode, "big-endian")) + return true; + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) && + fwnode_property_present(fwnode, "native-endian")) + return true; + return false; +} + static inline bool fwnode_device_is_compatible(const struct fwnode_handle *fwnode, const char *compat) { return fwnode_property_match_string(fwnode, "compatible", compat) >= 0; } +/** + * device_is_big_endian - check if a device has BE registers + * @dev: Pointer to the struct device + * + * Returns: true if the device has a "big-endian" property, or if the kernel + * was compiled for BE *and* the device has a "native-endian" property. + * Returns false otherwise. 
+ * + * Callers would nominally use ioread32be/iowrite32be if + * device_is_big_endian() == true, or readl/writel otherwise. + */ +static inline bool device_is_big_endian(const struct device *dev) +{ + return fwnode_device_is_big_endian(dev_fwnode(dev)); +} + /** * device_is_compatible - match 'compatible' property of the device with a given string * @dev: Pointer to the struct device -- cgit v1.2.3 From c27dfca4555bf74dd7dd7161d8ef2790ec1c7283 Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Fri, 8 Dec 2023 11:21:43 +0800 Subject: misc: rtsx: add to support new card reader rts5264 Add new definitions and functions in order to support the new chip rts5264. The definitions of some internal registers are defined in the new file rts5264.h, and some callback functions and the workflow for rts5264 are defined in the new file rts5264.c. Also add rts5264.o to the Makefile. Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20231208032145.2143580-2-ricky_wu@realtek.com Signed-off-by: Greg Kroah-Hartman --- include/linux/rtsx_pci.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 534038d962e4f..4612ef09a0c76 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -60,6 +60,7 @@ #define SD_EXIST (1 << 16) #define DELINK_INT GPIO0_INT #define MS_OC_INT (1 << 23) +#define SD_OVP_INT (1 << 23) #define SD_OC_INT (1 << 22) #define CARD_INT (XD_INT | MS_INT | SD_INT) @@ -80,6 +81,7 @@ #define OC_INT_EN (1 << 23) #define DELINK_INT_EN GPIO0_INT_EN #define MS_OC_INT_EN (1 << 23) +#define SD_OVP_INT_EN (1 << 23) #define SD_OC_INT_EN (1 << 22) #define RTSX_DUM_REG 0x1C @@ -583,6 +585,7 @@ #define OBFF_DISABLE 0x00 #define CDRESUMECTL 0xFE52 +#define CDGW 0xFE53 #define WAKE_SEL_CTL 0xFE54 #define PCLK_CTL 0xFE55 #define PCLK_MODE_SEL 0x20 @@ -764,6 +767,9 @@ #define SD_VIO_LDO_1V8 0x40 #define SD_VIO_LDO_3V3 0x70 +#define RTS5264_AUTOLOAD_CFG2 0xFF7D +#define RTS5264_CHIP_RST_N_SEL (1 << 6) + 
#define RTS5260_AUTOLOAD_CFG4 0xFF7F #define RTS5260_MIMO_DISABLE 0x8A /*RTS5261*/ @@ -1261,6 +1267,7 @@ struct rtsx_pcr { u8 dma_error_count; u8 ocp_stat; u8 ocp_stat2; + u8 ovp_stat; u8 rtd3_en; }; @@ -1271,6 +1278,7 @@ struct rtsx_pcr { #define PID_5260 0x5260 #define PID_5261 0x5261 #define PID_5228 0x5228 +#define PID_5264 0x5264 #define CHK_PCI_PID(pcr, pid) ((pcr)->pci->device == (pid)) #define PCI_VID(pcr) ((pcr)->pci->vendor) -- cgit v1.2.3 From b6e53731e07db7e8d35b789fd83565fe75540180 Mon Sep 17 00:00:00 2001 From: Fei Shao Date: Wed, 6 Dec 2023 15:17:26 -0800 Subject: spmi: Introduce device-managed functions Utilize the managed resource (devres) framework and add the following devm_* helpers for the SPMI driver: - devm_spmi_controller_alloc() - devm_spmi_controller_add() [sboyd@kernel.org: Rename to spmi-devres for module niceness, slap on GPL module license] Signed-off-by: Fei Shao Link: https://lore.kernel.org/r/20230824104101.4083400-2-fshao@chromium.org Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20231206231733.4031901-4-sboyd@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/spmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spmi.h b/include/linux/spmi.h index 2a4ce4144f9f1..28e8c8bd39441 100644 --- a/include/linux/spmi.h +++ b/include/linux/spmi.h @@ -120,6 +120,9 @@ static inline void spmi_controller_put(struct spmi_controller *ctrl) int spmi_controller_add(struct spmi_controller *ctrl); void spmi_controller_remove(struct spmi_controller *ctrl); +struct spmi_controller *devm_spmi_controller_alloc(struct device *parent, size_t size); +int devm_spmi_controller_add(struct device *parent, struct spmi_controller *ctrl); + /** * struct spmi_driver - SPMI slave device driver * @driver: SPMI device drivers should initialize name and owner field of -- cgit v1.2.3 From 8d6608e4f89a0a21caadcf32fb5ed700e2f5682d Mon Sep 17 00:00:00 2001 
From: Michal Simek Date: Thu, 14 Dec 2023 15:53:47 +0100 Subject: firmware: xilinx: Remove clock_setrate and clock_getrate api As per the current code base, PM_CLOCK_SETRATE and PM_CLOCK_GETRATE APIs are not supported for the runtime operations. In the case of ZynqMP returning an error from TF-A when there is any request to access these APIs and for Versal also it is returning an error like NO_ACCESS from the firmware. So, just removing the unused code to avoid the confusion around these APIs. Also, there is no issue with the backward compatibility as these APIs were never used since implemented. Hence no need to bump up the version of the feature check API as well. Signed-off-by: Ronak Jain Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/6ccbffbafd1f0f48f6574d5a3bf2db6a5603fdb0.1702565618.git.michal.simek@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 6b48294f3c923..c6a7fb1f980b2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -160,8 +160,6 @@ enum pm_api_id { PM_CLOCK_GETSTATE = 38, PM_CLOCK_SETDIVIDER = 39, PM_CLOCK_GETDIVIDER = 40, - PM_CLOCK_SETRATE = 41, - PM_CLOCK_GETRATE = 42, PM_CLOCK_SETPARENT = 43, PM_CLOCK_GETPARENT = 44, PM_FPGA_READ = 46, @@ -533,8 +531,6 @@ int zynqmp_pm_clock_disable(u32 clock_id); int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state); int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider); int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider); -int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate); -int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate); int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id); int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id); int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode); @@ -639,16 +635,6 @@ static 
inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) return -ENODEV; } -static inline int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) -{ - return -ENODEV; -} - -static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) -{ - return -ENODEV; -} - static inline int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) { return -ENODEV; -- cgit v1.2.3 From b9ae996210163e89a2a9aece7c582fb43694485a Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 14 Dec 2023 15:53:48 +0100 Subject: firmware: xilinx: Remove zynqmp_pm_pinctrl_get_function() There is no user for this interface that's why remove it. Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/e52a415a004e28a43e6d08e9e22d9e8fef3737df.1702565618.git.michal.simek@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index c6a7fb1f980b2..1478f691cc10e 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -149,7 +149,6 @@ enum pm_api_id { PM_SECURE_SHA = 26, PM_PINCTRL_REQUEST = 28, PM_PINCTRL_RELEASE = 29, - PM_PINCTRL_GET_FUNCTION = 30, PM_PINCTRL_SET_FUNCTION = 31, PM_PINCTRL_CONFIG_PARAM_GET = 32, PM_PINCTRL_CONFIG_PARAM_SET = 33, @@ -567,7 +566,6 @@ int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype); int zynqmp_pm_set_boot_health_status(u32 value); int zynqmp_pm_pinctrl_request(const u32 pin); int zynqmp_pm_pinctrl_release(const u32 pin); -int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id); int zynqmp_pm_pinctrl_set_function(const u32 pin, const u32 id); int zynqmp_pm_pinctrl_get_config(const u32 pin, const u32 param, u32 *value); @@ -804,11 +802,6 @@ static inline int zynqmp_pm_pinctrl_release(const u32 pin) return -ENODEV; } -static inline int zynqmp_pm_pinctrl_get_function(const u32 pin, u32 *id) -{ - return -ENODEV; -} - 
static inline int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id) { return -ENODEV; -- cgit v1.2.3 From f8fa5d76925991976b3e7076f9d1052515ec1fca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Dec 2023 13:24:10 -0700 Subject: cred: switch to using atomic_long_t There are multiple ways to grab references to credentials, and the only protection we have against overflowing it is the memory required to do so. With memory sizes only moving in one direction, let's bump the reference count to 64-bit and move it outside the realm of feasibly overflowing. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/cred.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index af8d353a4b86a..a3383f8efb8fc 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -109,7 +109,7 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp) * same context as task->real_cred. 
*/ struct cred { - atomic_t usage; + atomic_long_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; @@ -229,7 +229,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) */ static inline struct cred *get_new_cred_many(struct cred *cred, int nr) { - atomic_add(nr, &cred->usage); + atomic_long_add(nr, &cred->usage); return cred; } @@ -288,7 +288,7 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; - if (!atomic_inc_not_zero(&nonconst_cred->usage)) + if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; validate_creds(cred); nonconst_cred->non_rcu = 0; @@ -313,7 +313,7 @@ static inline void put_cred_many(const struct cred *_cred, int nr) if (cred) { validate_creds(cred); - if (atomic_sub_and_test(nr, &cred->usage)) + if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } } -- cgit v1.2.3 From ae1914174a63a558113e80d24ccac2773f9f7b2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Dec 2023 13:40:57 -0700 Subject: cred: get rid of CONFIG_DEBUG_CREDENTIALS This code is rarely (never?) enabled by distros, and it hasn't caught anything in decades. Let's kill off this legacy debug code. 
Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/cred.h | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index a3383f8efb8fc..2976f534a7a32 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -110,13 +110,6 @@ static inline int groups_search(const struct group_info *group_info, kgid_t grp) */ struct cred { atomic_long_t usage; -#ifdef CONFIG_DEBUG_CREDENTIALS - atomic_t subscribers; /* number of processes subscribed */ - void *put_addr; - unsigned magic; -#define CRED_MAGIC 0x43736564 -#define CRED_MAGIC_DEAD 0x44656144 -#endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ @@ -172,46 +165,6 @@ extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); -/* - * check for validity of credentials - */ -#ifdef CONFIG_DEBUG_CREDENTIALS -extern void __noreturn __invalid_creds(const struct cred *, const char *, unsigned); -extern void __validate_process_creds(struct task_struct *, - const char *, unsigned); - -extern bool creds_are_invalid(const struct cred *cred); - -static inline void __validate_creds(const struct cred *cred, - const char *file, unsigned line) -{ - if (unlikely(creds_are_invalid(cred))) - __invalid_creds(cred, file, line); -} - -#define validate_creds(cred) \ -do { \ - __validate_creds((cred), __FILE__, __LINE__); \ -} while(0) - -#define validate_process_creds() \ -do { \ - __validate_process_creds(current, __FILE__, __LINE__); \ -} while(0) - -extern void validate_creds_for_do_exit(struct task_struct *); -#else -static inline void validate_creds(const struct cred *cred) -{ -} -static inline void validate_creds_for_do_exit(struct task_struct *tsk) -{ -} -static inline void validate_process_creds(void) -{ -} 
-#endif - static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, @@ -264,7 +217,6 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; - validate_creds(cred); nonconst_cred->non_rcu = 0; return get_new_cred_many(nonconst_cred, nr); } @@ -290,7 +242,6 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; - validate_creds(cred); nonconst_cred->non_rcu = 0; return cred; } @@ -312,7 +263,6 @@ static inline void put_cred_many(const struct cred *_cred, int nr) struct cred *cred = (struct cred *) _cred; if (cred) { - validate_creds(cred); if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } -- cgit v1.2.3 From 4382159696c9af67ee047ed55f2dbf05480f52f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:17 +0100 Subject: cfi: Flip headers Normal include order is that linux/foo.h should include asm/foo.h, CFI has it the wrong way around. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20231215092707.231038174@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/cfi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 3552ec82b7256..2309d74e77e68 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -9,6 +9,7 @@ #include #include +#include #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, -- cgit v1.2.3 From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:18 +0100 Subject: x86/cfi,bpf: Fix BPF JIT call The current BPF call convention is __nocfi, except when it calls !JIT things, then it calls regular C functions. 
It so happens that with FineIBT the __nocfi and C calling conventions are incompatible. Specifically __nocfi will call at func+0, while FineIBT will have endbr-poison there, which is not a valid indirect target. Causing #CP. Notably this only triggers on IBT enabled hardware, which is probably why this hasn't been reported (also, most people will have JIT on anyway). Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for x86. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++++++++-- include/linux/cfi.h | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c87c608a36892..9d84c376851af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1211,7 +1212,11 @@ struct bpf_dispatcher { #endif }; -static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( +#ifndef __bpfcall +#define __bpfcall __nocfi +#endif + +static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) @@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ - noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ + noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ @@ -1453,6 +1458,9 @@ struct bpf_prog_aux { struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; +#ifdef CONFIG_FINEIBT + struct bpf_ksym ksym_prefix; +#endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git 
a/include/linux/cfi.h b/include/linux/cfi.h index 2309d74e77e68..1ed2d96c0cfc8 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,13 @@ #include #include +#ifndef cfi_get_offset +static inline int cfi_get_offset(void) +{ + return 0; +} +#endif + #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type); -- cgit v1.2.3 From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:20 +0100 Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI BPF struct_ops uses __arch_prepare_bpf_trampoline() to write trampolines for indirect function calls. These trampolines must have matching CFI. In order to obtain the correct CFI hash for the various methods, add a matching structure that contains stub functions, the compiler will generate correct CFI which we can pilfer for the trampolines. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d84c376851af..db46b3359bf5f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1060,6 +1060,17 @@ struct btf_func_model { */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) +/* + * Indicate the trampoline should be suitable to receive indirect calls; + * without this indirectly calling the generated code can result in #UD/#CP, + * depending on the CFI options. + * + * Used by bpf_struct_ops. + * + * Incompatible with FENTRY usage, overloads @func_addr argument. + */ +#define BPF_TRAMP_F_INDIRECT BIT(8) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. 
*/ @@ -1697,6 +1708,7 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; + void *cfi_stubs; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, + void *stub_func, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { -- cgit v1.2.3 From e9d13b9d2f99ccf7afeab490d97eaa5ac9846598 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:21 +0100 Subject: cfi: Add CFI_NOSEAL() Add a CFI_NOSEAL() helper to mark functions that need to retain their CFI information, despite not otherwise leaking their address. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.669401084@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/cfi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 1ed2d96c0cfc8..f0df518e11dd1 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -46,4 +46,8 @@ static inline void module_cfi_finalize(const Elf_Ehdr *hdr, #endif /* CONFIG_ARCH_USES_CFI_TRAPS */ #endif /* CONFIG_MODULES */ +#ifndef CFI_NOSEAL +#define CFI_NOSEAL(x) +#endif + #endif /* _LINUX_CFI_H */ -- cgit v1.2.3 From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 Dec 2023 10:12:23 +0100 Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature As per the earlier patches, BPF sub-programs have bpf_callback_t signature and CFI expects callers to have matching signature. This is violated by bpf_prog_aux::bpf_exception_cb(). 
[peterz: Changelog] Reported-by: Peter Zijlstra Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db46b3359bf5f..5e694934cf37a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1484,7 +1484,7 @@ struct bpf_prog_aux { int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; - unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); + u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif -- cgit v1.2.3 From 117211aa739a926e6555cfea883be84bee6f1695 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 16 Dec 2023 00:05:02 +0100 Subject: bpf: Add missing BPF_LINK_TYPE invocations Pengfei Xu reported [1] Syzkaller/KASAN issue found in bpf_link_show_fdinfo. The reason is missing BPF_LINK_TYPE invocation for uprobe multi link and for several other links, adding that. 
[1] https://lore.kernel.org/bpf/ZXptoKRSLspnk2ie@xpf.sh.intel.com/ Fixes: 89ae89f53d20 ("bpf: Add multi uprobe link") Fixes: e420bed02507 ("bpf: Add fd-based tcx multi-prog infra with link support") Fixes: 84601d6ee68a ("bpf: add bpf_link support for BPF_NETFILTER programs") Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Reported-by: Pengfei Xu Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Tested-by: Pengfei Xu Acked-by: Hou Tao Link: https://lore.kernel.org/bpf/20231215230502.2769743-1-jolsa@kernel.org --- include/linux/bpf_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fc0d6f32c6876..94baced5a1ad6 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -142,9 +142,13 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) #ifdef CONFIG_NET BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) +BPF_LINK_TYPE(BPF_LINK_TYPE_NETFILTER, netfilter) +BPF_LINK_TYPE(BPF_LINK_TYPE_TCX, tcx) +BPF_LINK_TYPE(BPF_LINK_TYPE_NETKIT, netkit) #endif #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) BPF_LINK_TYPE(BPF_LINK_TYPE_STRUCT_OPS, struct_ops) +BPF_LINK_TYPE(BPF_LINK_TYPE_UPROBE_MULTI, uprobe_multi) -- cgit v1.2.3 From bb339db4d363c84e0a8d70827df591397ccd7312 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 15 Dec 2023 17:56:48 +0000 Subject: arm: perf: Fix ARCH=arm build with GCC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLVM ignores everything inside the if statement and doesn't generate errors, but GCC doesn't ignore it, resulting in the following error: drivers/perf/arm_pmuv3.c: In function ‘armv8pmu_write_evtype’: include/linux/bits.h:34:29: error: left shift count >= width of type [-Werror=shift-count-overflow] 34 | (((~UL(0)) - (UL(1) << (l)) + 1) & \ Fix it by using 
GENMASK_ULL which doesn't overflow on arm32 (even though the value is never used there). Fixes: 3115ee021bfb ("arm64: perf: Include threshold control fields in PMEVTYPER mask") Reported-by: Uwe Kleine-König Closes: https://lore.kernel.org/linux-arm-kernel/20231215120817.h2f3akgv72zhrtqo@pengutronix.de/ Signed-off-by: James Clark Acked-by: Mark Rutland Reviewed-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20231215175648.3397170-2-james.clark@arm.com Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 0f4d62ef3a9a1..46377e134d67c 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -234,8 +234,8 @@ * PMXEVTYPER: Event selection reg */ #define ARMV8_PMU_EVTYPE_EVENT GENMASK(15, 0) /* Mask for EVENT bits */ -#define ARMV8_PMU_EVTYPE_TH GENMASK(43, 32) -#define ARMV8_PMU_EVTYPE_TC GENMASK(63, 61) +#define ARMV8_PMU_EVTYPE_TH GENMASK_ULL(43, 32) /* arm64 only */ +#define ARMV8_PMU_EVTYPE_TC GENMASK_ULL(63, 61) /* arm64 only */ /* * Event filters for PMUv3 -- cgit v1.2.3 From ebb30ccbbdbd6fae5177b676da4f4ac92bb4f635 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 15 Dec 2023 14:15:31 +0100 Subject: net: phy: make addr type u8 in phy_package_shared struct Switch addr type in phy_package_shared struct to u8. The value is already checked to be non negative and to be less than PHY_MAX_ADDR, hence u8 is better suited than using int. Signed-off-by: Christian Marangi Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index dbb5e13e3e1bf..4b13cc85c4f5b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -338,7 +338,7 @@ struct mdio_bus_stats { * phy_package_leave(). 
*/ struct phy_package_shared { - int addr; + u8 addr; refcount_t refcnt; unsigned long flags; size_t priv_size; -- cgit v1.2.3 From 9eea577eb1155fe4a183bc5e7bf269b0b2e7a6ba Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 15 Dec 2023 14:15:32 +0100 Subject: net: phy: extend PHY package API to support multiple global address Current API for PHY package are limited to single address to configure global settings for the PHY package. It was found that some PHY package (for example the qca807x, a PHY package that is shipped with a bundle of 5 PHY) requires multiple PHY address to configure global settings. An example scenario is a PHY that have a dedicated PHY for PSGMII/serdes calibration and have a specific PHY in the package where the global PHY mode is set and affects every other PHY in the package. Change the API in the following way: - Change phy_package_join() to take the base addr of the PHY package instead of the global PHY addr. - Make __/phy_package_write/read() require an additional arg that select what global PHY address to use by passing the offset from the base addr passed on phy_package_join(). Each user of this API is updated to follow this new implementation following a pattern where an enum is defined to declare the offset of the addr. We also drop the check if shared is defined as any user of the phy_package_read/write is expected to use phy_package_join first. Misuse of this will correctly trigger a kernel panic for NULL pointer exception. Signed-off-by: Christian Marangi Signed-off-by: David S. 
Miller --- include/linux/phy.h | 64 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4b13cc85c4f5b..d653f660c39d7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,7 +327,8 @@ struct mdio_bus_stats { /** * struct phy_package_shared - Shared information in PHY packages - * @addr: Common PHY address used to combine PHYs in one package + * @base_addr: Base PHY address of PHY package used to combine PHYs + * in one package and for offset calculation of phy_package_read/write * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv @@ -338,7 +339,7 @@ struct mdio_bus_stats { * phy_package_leave(). */ struct phy_package_shared { - u8 addr; + u8 base_addr; refcount_t refcnt; unsigned long flags; size_t priv_size; @@ -1976,10 +1977,10 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev, int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size); +int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, - int addr, size_t priv_size); + int base_addr, size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); @@ -2002,46 +2003,65 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); -static inline int phy_package_read(struct phy_device *phydev, u32 regnum) +static inline int phy_package_address(struct phy_device *phydev, + unsigned int addr_offset) { struct phy_package_shared *shared = phydev->shared; + u8 
base_addr = shared->base_addr; - if (!shared) + if (addr_offset >= PHY_MAX_ADDR - base_addr) return -EIO; - return mdiobus_read(phydev->mdio.bus, shared->addr, regnum); + /* we know that addr will be in the range 0..31 and thus the + * implicit cast to a signed int is not a problem. + */ + return base_addr + addr_offset; } -static inline int __phy_package_read(struct phy_device *phydev, u32 regnum) +static inline int phy_package_read(struct phy_device *phydev, + unsigned int addr_offset, u32 regnum) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; + + return mdiobus_read(phydev->mdio.bus, addr, regnum); +} + +static inline int __phy_package_read(struct phy_device *phydev, + unsigned int addr_offset, u32 regnum) +{ + int addr = phy_package_address(phydev, addr_offset); + + if (addr < 0) + return addr; - return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum); + return __mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, - u32 regnum, u16 val) + unsigned int addr_offset, u32 regnum, + u16 val) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; - return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, - u32 regnum, u16 val) + unsigned int addr_offset, u32 regnum, + u16 val) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; - return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); + return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline bool __phy_package_set_once(struct 
phy_device *phydev, -- cgit v1.2.3 From d63710fc0f1a501fd75a7025e3070a96ffa1645f Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 15 Dec 2023 14:15:34 +0100 Subject: net: phy: add support for PHY package MMD read/write Some PHY in PHY package may require to read/write MMD regs to correctly configure the PHY package. Add support for these additional required function in both lock and no lock variant. It's assumed that the entire PHY package is either C22 or C45. We use C22 or C45 way of writing/reading to mmd regs based on the passed phydev whether it's C22 or C45. Signed-off-by: Christian Marangi Signed-off-by: David S. Miller --- include/linux/phy.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index d653f660c39d7..e9e85d3475872 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2064,6 +2064,22 @@ static inline int __phy_package_write(struct phy_device *phydev, return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } +int __phy_package_read_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum); + +int phy_package_read_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum); + +int __phy_package_write_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum, u16 val); + +int phy_package_write_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum, u16 val); + static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { -- cgit v1.2.3 From 120931db07b49252aba2073096b595482d71857c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 27 Nov 2023 23:36:52 -0600 Subject: rtc: Add support for configuring the UIP timeout for RTC reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The UIP timeout is hardcoded to 10ms for all RTC reads, but in some contexts this might 
not be enough time. Add a timeout parameter to mc146818_get_time() and mc146818_get_time_callback(). If UIP timeout is configured by caller to be >=100 ms and a call takes this long, log a warning. Make all callers use 10ms to ensure no functional changes. Cc: # 6.1.y Fixes: ec5895c0f2d8 ("rtc: mc146818-lib: extract mc146818_avoid_UIP") Signed-off-by: Mario Limonciello Tested-by: Mateusz Jończyk Reviewed-by: Mateusz Jończyk Acked-by: Mateusz Jończyk Link: https://lore.kernel.org/r/20231128053653.101798-4-mario.limonciello@amd.com Signed-off-by: Alexandre Belloni --- include/linux/mc146818rtc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index b0da04fe087bb..34dfcc77f505a 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -126,10 +126,11 @@ struct cmos_rtc_board_info { #endif /* ARCH_RTC_LOCATION */ bool mc146818_does_rtc_work(void); -int mc146818_get_time(struct rtc_time *time); +int mc146818_get_time(struct rtc_time *time, int timeout); int mc146818_set_time(struct rtc_time *time); bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), + int timeout, void *param); #endif /* _MC146818RTC_H */ -- cgit v1.2.3 From 32da0f00ddcb101730cf242289b2b10ede0e1156 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 15 Dec 2023 14:57:10 -0300 Subject: net: rtnl: introduce rcu_replace_pointer_rtnl Introduce the rcu_replace_pointer_rtnl helper to lockdep check rtnl lock rcu replacements, alongside the already existing helpers. This is a quality of life helper so instead of using: rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) .. or the open coded.. rtnl_dereference() / rcu_assign_pointer() .. or the lazy check version .. 
rcu_replace_pointer(rp, p, 1) Use: rcu_replace_pointer_rtnl(rp, p) Signed-off-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Signed-off-by: Pedro Tammela Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 6a8543b34e2c0..410529fca18b2 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -79,6 +79,18 @@ static inline bool lockdep_rtnl_is_held(void) #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) +/** + * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning + * its old value + * @rp: RCU pointer, whose value is returned + * @p: regular pointer + * + * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated + * pointer. The old value of @rp is returned, and @rp is set to @p + */ +#define rcu_replace_pointer_rtnl(rp, p) \ + rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) + static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); -- cgit v1.2.3 From b2adbc9cea752539f6421e9d4642408f666c1251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= Date: Fri, 24 Nov 2023 14:17:44 +0100 Subject: clk: si5351: allow PLLs to be adjusted without reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new PLL reset mode flag which controls whether or not to reset a PLL after adjusting its rate. The mode can be configured through platform data or device tree. Since commit 6dc669a22c77 ("clk: si5351: Add PLL soft reset"), the driver unconditionally resets a PLL whenever its rate is adjusted. The rationale was that a PLL reset was required to get three outputs working at the same time. Before this change, the driver never reset the PLLs. 
Commit b26ff127c52c ("clk: si5351: Apply PLL soft reset before enabling the outputs") subsequently introduced an option to reset the PLL when enabling a clock output that sourced it. Here, the rationale was that this is required to get a deterministic phase relationship between multiple output clocks. This clearly shows that it is useful to reset the PLLs in applications where multiple clock outputs are used. However, the Si5351 also allows for glitch-free rate adjustment of its PLLs if one avoids resetting the PLL. In our audio application where a single Si5351 clock output is used to supply a runtime adjustable bit clock, this unconditional PLL reset behaviour introduces unwanted glitches in the clock output. It would appear that the problem being solved in the former commit may be solved by using the optional device tree property introduced in the latter commit, obviating the need for an unconditional PLL reset after rate adjustment. But it's not OK to break the default behaviour of the driver, and it cannot be assumed that all device trees are using the property introduced in the latter commit. Hence, the new behaviour is made opt-in. 
Cc: Sebastian Hesselbarth Cc: Rabeeh Khoury Cc: Jacob Siverskog Cc: Sergej Sawazki Signed-off-by: Alvin Šipraga Acked-by: Sebastian Hesselbarth Link: https://lore.kernel.org/r/20231124-alvin-clk-si5351-no-pll-reset-v6-3-69b82311cb90@bang-olufsen.dk Signed-off-by: Stephen Boyd --- include/linux/platform_data/si5351.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/si5351.h b/include/linux/platform_data/si5351.h index c71a2dd661437..5f412a615532b 100644 --- a/include/linux/platform_data/si5351.h +++ b/include/linux/platform_data/si5351.h @@ -105,10 +105,12 @@ struct si5351_clkout_config { * @clk_xtal: xtal input clock * @clk_clkin: clkin input clock * @pll_src: array of pll source clock setting + * @pll_reset: array indicating if plls should be reset after setting the rate * @clkout: array of clkout configuration */ struct si5351_platform_data { enum si5351_pll_src pll_src[2]; + bool pll_reset[2]; struct si5351_clkout_config clkout[8]; }; -- cgit v1.2.3 From 5a2a2cda916335fff4d804e58f36b2305926841e Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Mon, 18 Dec 2023 15:16:16 +0800 Subject: gpiolib: remove duplicate inclusions Remove second `#include `. Remove `#include ` too as it's included by `err.h`. 
Signed-off-by: Wang Jinchao Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index bd9bea7cb270a..e846bd4e7559b 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -722,7 +722,6 @@ int gpiochip_irqchip_add_domain(struct gpio_chip *gc, #else #include -#include static inline int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain) @@ -811,8 +810,6 @@ const char *gpio_device_get_label(struct gpio_device *gdev); #else /* CONFIG_GPIOLIB */ -#include - #include static inline struct gpio_chip *gpiod_to_chip(const struct gpio_desc *desc) -- cgit v1.2.3 From 58ff9c5acb4aef58e118bbf39736cc4d6c11a3d3 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 22 Nov 2023 15:03:51 +0900 Subject: PCI: Rename PCI_IRQ_LEGACY to PCI_IRQ_INTX Rename PCI_IRQ_LEGACY to PCI_IRQ_INTX to be more explicit about the type of IRQ being referenced as well as to match the PCI specifications terms. Redefine PCI_IRQ_LEGACY as an alias to PCI_IRQ_INTX to avoid the need for doing the renaming tree-wide. New drivers and new code should now prefer using PCI_IRQ_INTX instead of PCI_IRQ_LEGACY. 
Link: https://lore.kernel.org/r/20231122060406.14695-2-dlemoal@kernel.org Signed-off-by: Bjorn Helgaas Signed-off-by: Damien Le Moal Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda Reviewed-by: Serge Semin Reviewed-by: Christoph Hellwig Acked-by: Manivannan Sadhasivam --- include/linux/pci.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc8679..9ab4b46c0d19c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1073,11 +1073,13 @@ enum { PCI_SCAN_ALL_PCIE_DEVS = 0x00000040, /* Scan all, not just dev 0 */ }; -#define PCI_IRQ_LEGACY (1 << 0) /* Allow legacy interrupts */ +#define PCI_IRQ_INTX (1 << 0) /* Allow INTx interrupts */ #define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ #define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ #define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ +#define PCI_IRQ_LEGACY PCI_IRQ_INTX /* Deprecated! Use PCI_IRQ_INTX */ + /* These external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI -- cgit v1.2.3 From 74955cb8ccc38539f8c029336e07e6b43b6a942e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 22 Nov 2023 15:03:52 +0900 Subject: PCI: endpoint: Drop PCI_EPC_IRQ_XXX definitions linux/pci.h defines the IRQ flags PCI_IRQ_INTX, PCI_IRQ_MSI and PCI_IRQ_MSIX. Let's use these flags directly instead of the endpoint definitions provided by enum pci_epc_irq_type. This removes the need for defining this enum type completely. 
Link: https://lore.kernel.org/r/20231122060406.14695-3-dlemoal@kernel.org Signed-off-by: Damien Le Moal Signed-off-by: Lorenzo Pieralisi Reviewed-by: Serge Semin Reviewed-by: Christoph Hellwig Acked-by: Manivannan Sadhasivam --- include/linux/pci-epc.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5cb6940310729..f498f9aa2ab0e 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -19,13 +19,6 @@ enum pci_epc_interface_type { SECONDARY_INTERFACE, }; -enum pci_epc_irq_type { - PCI_EPC_IRQ_UNKNOWN, - PCI_EPC_IRQ_LEGACY, - PCI_EPC_IRQ_MSI, - PCI_EPC_IRQ_MSIX, -}; - static inline const char * pci_epc_interface_string(enum pci_epc_interface_type type) { @@ -79,7 +72,7 @@ struct pci_epc_ops { u16 interrupts, enum pci_barno, u32 offset); int (*get_msix)(struct pci_epc *epc, u8 func_no, u8 vfunc_no); int (*raise_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, - enum pci_epc_irq_type type, u16 interrupt_num); + unsigned int type, u16 interrupt_num); int (*map_msi_irq)(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, @@ -229,7 +222,7 @@ int pci_epc_map_msi_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, phys_addr_t phys_addr, u8 interrupt_num, u32 entry_size, u32 *msi_data, u32 *msi_addr_offset); int pci_epc_raise_irq(struct pci_epc *epc, u8 func_no, u8 vfunc_no, - enum pci_epc_irq_type type, u16 interrupt_num); + unsigned int type, u16 interrupt_num); int pci_epc_start(struct pci_epc *epc); void pci_epc_stop(struct pci_epc *epc); const struct pci_epc_features *pci_epc_get_features(struct pci_epc *epc, -- cgit v1.2.3 From 3314f2097dee43defc20554f961a8b17f4787e2d Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 5 Dec 2023 17:01:01 -0800 Subject: intel: add bit macro includes where needed This series is introducing the use of FIELD_GET and FIELD_PREP which requires 
bitfield.h to be included. Fix all the includes in this one change, and rearrange includes into alphabetical order to ease readability and future maintenance. virtchnl.h and its usage was modified to have its own includes as it should. This required including bits.h for virtchnl.h. Reviewed-by: Marcin Szycik Signed-off-by: Jesse Brandeburg Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index a44d9dc7e3eb6..8e177b67e82f4 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -5,6 +5,7 @@ #define _VIRTCHNL_H_ #include +#include #include #include -- cgit v1.2.3 From d81f0d7b8b23ec79f80be602ed6129ded27862e8 Mon Sep 17 00:00:00 2001 From: Rae Moar Date: Wed, 13 Dec 2023 19:44:17 +0000 Subject: kunit: add KUNIT_INIT_TABLE to init linker section Add KUNIT_INIT_TABLE to the INIT_DATA linker section. Alter the KUnit macros to create init tests: kunit_test_init_section_suites Update lib/kunit/executor.c to run both the suites in KUNIT_TABLE and KUNIT_INIT_TABLE.
Reviewed-by: David Gow Signed-off-by: Rae Moar Signed-off-by: Shuah Khan --- include/linux/module.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index a98e188cf37b8..9cd0009bd050c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -540,6 +540,8 @@ struct module { struct static_call_site *static_call_sites; #endif #if IS_ENABLED(CONFIG_KUNIT) + int num_kunit_init_suites; + struct kunit_suite **kunit_init_suites; int num_kunit_suites; struct kunit_suite **kunit_suites; #endif -- cgit v1.2.3 From 86362293044b382aece355f9e4e3f7116dcd1eae Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 22 Jul 2023 16:08:44 -0700 Subject: PCI: endpoint: Make struct pci_epf_ops in pci_epf_driver const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pci_epf_ops struct contains a set of callbacks that are used by the pci_epf_driver, and is never modified by the EPF core itself. Marking the struct pointer const allows EPF drivers to declare their pci_epf_ops struct to be const. This allows the struct to be placed in the read-only section. Which for example brings some security benefits as the callbacks can not be overwritten. 
[kwilczynski: commit log] Link: https://lore.kernel.org/linux-pci/20230722230848.589428-1-lars@metafoo.de Signed-off-by: Lars-Peter Clausen Signed-off-by: Krzysztof Wilczyński Reviewed-by: Manivannan Sadhasivam --- include/linux/pci-epf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 3f44b6aec4770..764dc62b3acc9 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -98,7 +98,7 @@ struct pci_epf_driver { void (*remove)(struct pci_epf *epf); struct device_driver driver; - struct pci_epf_ops *ops; + const struct pci_epf_ops *ops; struct module *owner; struct list_head epf_group; const struct pci_epf_device_id *id_table; -- cgit v1.2.3 From d23569979ca1cd139a42c410e0c7b9e6014c3b3a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 13 Dec 2023 09:37:01 -0500 Subject: tracing: Allow creating instances with specified system events A trace instance may only need to enable specific events. As the eventfs directory of an instance currently creates all events which adds overhead, allow internal instances to be created with just the events in systems that they care about. This currently only deals with systems and not individual events, but this should bring down the overhead of creating instances for specific use cases quite a bit. The trace_array_get_by_name() now has another parameter "systems". This parameter is a const string pointer of a comma/space separated list of event systems that should be created by the trace_array. (Note if the trace_array already exists, this parameter is ignored). The list of systems is saved and if a module is loaded, its events will not be added unless the system for those events also matches the systems string.
Link: https://lore.kernel.org/linux-trace-kernel/20231213093701.03fddec0@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Sean Paul Cc: Arun Easi Cc: Daniel Wagner Tested-by: Dmytro Maluka Signed-off-by: Steven Rostedt (Google) --- include/linux/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace.h b/include/linux/trace.h index 2a70a447184c9..fdcd76b7be83d 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); int trace_array_init_printk(struct trace_array *tr); void trace_array_put(struct trace_array *tr); -struct trace_array *trace_array_get_by_name(const char *name); +struct trace_array *trace_array_get_by_name(const char *name, const char *systems); int trace_array_destroy(struct trace_array *tr); /* For osnoise tracer */ @@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr) static inline void trace_array_put(struct trace_array *tr) { } -static inline struct trace_array *trace_array_get_by_name(const char *name) +static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems) { return NULL; } -- cgit v1.2.3 From 8ec90be7f15fac42992ea821be929d3b06cd0fd9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Dec 2023 13:19:01 -0500 Subject: tracing: Allow for max buffer data size trace_marker writes Allow a trace write to be as big as the ring buffer tracing data will allow. Currently, it only allows writes of 1KB in size, but there's no reason that it cannot allow what the ring buffer can hold. 
Link: https://lore.kernel.org/linux-trace-kernel/20231212131901.5f501e72@gandalf.local.home Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 782e14f62201f..b1b03b2c0f08e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter); bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter); unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu); +unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer); void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu); void ring_buffer_reset_online_cpus(struct trace_buffer *buffer); -- cgit v1.2.3 From 40fc60e36c60ba85b2974e507b67df40c94e9578 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 9 Dec 2023 17:52:20 -0500 Subject: trace_seq: Increase the buffer size to almost two pages Now that trace_marker can hold more than 1KB string, and can write as much as the ring buffer can hold, the trace_seq is not big enough to hold writes: ~# a="1234567890" ~# cnt=4080 ~# s="" ~# while [ $cnt -gt 10 ]; do ~# s="${s}${a}" ~# cnt=$((cnt-10)) ~# done ~# echo $s > trace_marker ~# cat trace # tracer: nop # # entries-in-buffer/entries-written: 2/2 #P:8 # # _-----=> irqs-off/BH-disabled # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / _-=> migrate-disable # |||| / delay # TASK-PID CPU# ||||| TIMESTAMP FUNCTION # | | | ||||| | | <...>-860 [002] ..... 105.543465: tracing_mark_write[LINE TOO BIG] <...>-860 [002] ..... 105.543496: tracing_mark_write: 789012345678901234567890 By increasing the trace_seq buffer to almost two pages, it can now print out the first line. 
This also subtracts the rest of the trace_seq fields from the buffer, so that the entire trace_seq is now PAGE_SIZE aligned. Link: https://lore.kernel.org/linux-trace-kernel/20231209175220.19867af4@gandalf.local.home Cc: Mark Rutland Cc: Mathieu Desnoyers Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 3691e0e76a1a2..9ec229dfddaa7 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -8,11 +8,14 @@ /* * Trace sequences are used to allow a function to call several other functions - * to create a string of data to use (up to a max of PAGE_SIZE). + * to create a string of data to use. */ +#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \ + (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) + struct trace_seq { - char buffer[PAGE_SIZE]; + char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; @@ -21,7 +24,7 @@ struct trace_seq { static inline void trace_seq_init(struct trace_seq *s) { - seq_buf_init(&s->seq, s->buffer, PAGE_SIZE); + seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE); s->full = 0; s->readpos = 0; } -- cgit v1.2.3 From b9670ee2e975e1cb6751019d5dc5c193aecd8ba2 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 17 Dec 2023 12:07:02 -0800 Subject: Revert "iio: hid-sensor-als: Add light chromaticity support" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit ee3710f39f9d0ae5137a866138d005fe1ad18132. This commit assumes that every HID descriptor for ALS sensor has presence of usage id ID HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X and HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y. When the above usage ids are absent, driver probe fails. This breaks ALS sensor functionality on many platforms. 
Till we have a good solution, revert this commit. Reported-by: Thomas Weißschuh Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218223 Signed-off-by: Srinivas Pandruvada Cc: Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20231217200703.719876-2-srinivas.pandruvada@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/hid-sensor-ids.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 6730ee900ee1c..8af4fb3e0254e 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -22,9 +22,6 @@ #define HID_USAGE_SENSOR_DATA_LIGHT 0x2004d0 #define HID_USAGE_SENSOR_LIGHT_ILLUM 0x2004d1 #define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE 0x2004d2 -#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY 0x2004d3 -#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_X 0x2004d4 -#define HID_USAGE_SENSOR_LIGHT_CHROMATICITY_Y 0x2004d5 /* PROX (200011) */ #define HID_USAGE_SENSOR_PROX 0x200011 -- cgit v1.2.3 From d4005431673929a1259ad791db87408fcf85d2cc Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sun, 17 Dec 2023 12:07:03 -0800 Subject: Revert "iio: hid-sensor-als: Add light color temperature support" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5f05285df691b1e82108eead7165feae238c95ef. This commit assumes that every HID descriptor for ALS sensor has presence of usage ID HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE. When the above usage id is absent, driver probe fails. This breaks ALS sensor functionality on many platforms. Till we have a good solution, revert this commit.
Reported-by: Thomas Weißschuh Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218223 Signed-off-by: Srinivas Pandruvada Cc: Acked-by: Jonathan Cameron Link: https://lore.kernel.org/r/20231217200703.719876-3-srinivas.pandruvada@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/hid-sensor-ids.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 8af4fb3e0254e..13b1e65fbdccb 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -21,7 +21,6 @@ #define HID_USAGE_SENSOR_ALS 0x200041 #define HID_USAGE_SENSOR_DATA_LIGHT 0x2004d0 #define HID_USAGE_SENSOR_LIGHT_ILLUM 0x2004d1 -#define HID_USAGE_SENSOR_LIGHT_COLOR_TEMPERATURE 0x2004d2 /* PROX (200011) */ #define HID_USAGE_SENSOR_PROX 0x200011 -- cgit v1.2.3 From 7259eb7b534735b9c1153654c0bb4c5f059c0dd3 Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Sun, 12 Nov 2023 18:07:10 +0200 Subject: accel/habanalabs/gaudi2: add signed dev info uAPI User will provide a nonce via the INFO ioctl, and will retrieve the signed device info generated using given nonce. Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- include/linux/habanalabs/cpucp_if.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 86ea7c63a0d29..f316c8d0f3fc9 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -659,6 +659,12 @@ enum pq_init_status { * number (nonce) provided by the host to prevent replay attacks. * public key and certificate also provided as part of the FW response. * + * CPUCP_PACKET_INFO_SIGNED_GET - + * Get the device information signed by the Trusted Platform device. + * device info data is also hashed with some unique number (nonce) provided + * by the host to prevent replay attacks. 
public key and certificate also + * provided as part of the FW response. + * * CPUCP_PACKET_MONITOR_DUMP_GET - * Get monitors registers dump from the CpuCP kernel. * The CPU will put the registers dump in the a buffer allocated by the driver @@ -733,7 +739,7 @@ enum cpucp_packet_id { CPUCP_PACKET_ENGINE_CORE_ASID_SET, /* internal */ CPUCP_PACKET_RESERVED2, /* not used */ CPUCP_PACKET_SEC_ATTEST_GET, /* internal */ - CPUCP_PACKET_RESERVED3, /* not used */ + CPUCP_PACKET_INFO_SIGNED_GET, /* internal */ CPUCP_PACKET_RESERVED4, /* not used */ CPUCP_PACKET_MONITOR_DUMP_GET, /* debugfs */ CPUCP_PACKET_RESERVED5, /* not used */ -- cgit v1.2.3 From 403863e985e8eba608d53b2907caaf37b6176290 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 16 Dec 2023 13:29:58 +0100 Subject: netlink: introduce typedef for filter function Make the code using filter function a bit nicer by consolidating the filter function arguments using typedef. Suggested-by: Andy Shevchenko Signed-off-by: Jiri Pirko Signed-off-by: Paolo Abeni --- include/linux/connector.h | 3 +-- include/linux/netlink.h | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/connector.h b/include/linux/connector.h index cec2d99ae9021..70bc1160f3d8d 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -100,8 +100,7 @@ void cn_del_callback(const struct cb_id *id); */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask, - int (*filter)(struct sock *dsk, struct sk_buff *skb, - void *data), + netlink_filter_fn filter, void *filter_data); /** diff --git a/include/linux/netlink.h b/include/linux/netlink.h index abe91ed6b9aa0..1a4445bf2ab9a 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -228,10 +228,12 @@ bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, 
struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); + +typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); + int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, - struct sk_buff *skb, void *data), + netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 67ba055dd7758c34f6e64c9d35132362c1e1f0b5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Dec 2023 17:40:12 +0200 Subject: regulator: Reuse LINEAR_RANGE() in REGULATOR_LINEAR_RANGE() REGULATOR_LINEAR_RANGE() repeats what LINEAR_RANGE() provides. Deduplicate the former by using the latter. No functional change intended. Signed-off-by: Andy Shevchenko Link: https://msgid.link/r/20231219154012.2478688-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4b7eceb3828b0..22a07c0900a41 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -51,12 +51,7 @@ enum regulator_detection_severity { /* Initialize struct linear_range for regulators */ #define REGULATOR_LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) \ -{ \ - .min = _min_uV, \ - .min_sel = _min_sel, \ - .max_sel = _max_sel, \ - .step = _step_uV, \ -} + LINEAR_RANGE(_min_uV, _min_sel, _max_sel, _step_uV) /** * struct regulator_ops - regulator operations. 
-- cgit v1.2.3 From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Dec 2023 07:37:35 -0800 Subject: Revert BPF token-related functionality This patch includes the following revert (one conflicting BPF FS patch and three token patch sets, represented by merge commits): - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'"; - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs"; - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'"; - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'". Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 85 +++++-------------------------------------- include/linux/filter.h | 2 +- include/linux/lsm_hook_defs.h | 15 ++------ include/linux/security.h | 43 ++++------------------ 4 files changed, 22 insertions(+), 123 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2f54cc0436c4d..7a8d4c81a39a1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,10 +52,6 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; -struct bpf_token; -struct user_namespace; -struct super_block; -struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1488,7 +1484,6 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1613,31 +1608,6 @@ struct bpf_link_primer { u32 id; }; -struct bpf_mount_opts { - kuid_t uid; - kgid_t gid; - umode_t mode; - - /* BPF token-related delegation options */ - u64 delegate_cmds; - u64 delegate_maps; - u64 delegate_progs; - u64 delegate_attachs; -}; - -struct bpf_token { - struct work_struct work; - atomic64_t refcnt; - struct user_namespace *userns; - 
u64 allowed_cmds; - u64 allowed_maps; - u64 allowed_progs; - u64 allowed_attachs; -#ifdef CONFIG_SECURITY - void *security; -#endif -}; - struct bpf_struct_ops_value; struct btf_member; @@ -2097,7 +2067,6 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } -extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2232,26 +2201,24 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; -bool bpf_token_capable(const struct bpf_token *token, int cap); - -static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) +static inline bool bpf_allow_ptr_leaks(void) { - return bpf_token_capable(token, CAP_PERFMON); + return perfmon_capable(); } -static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) +static inline bool bpf_allow_uninit_stack(void) { - return bpf_token_capable(token, CAP_PERFMON); + return perfmon_capable(); } -static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) +static inline bool bpf_bypass_spec_v1(void) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return cpu_mitigations_off() || perfmon_capable(); } -static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) +static inline bool bpf_bypass_spec_v4(void) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return cpu_mitigations_off() || perfmon_capable(); } int bpf_map_new_fd(struct bpf_map *map, int flags); @@ -2268,21 +2235,8 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); -void bpf_token_inc(struct bpf_token *token); -void bpf_token_put(struct bpf_token *token); -int bpf_token_create(union bpf_attr *attr); -struct bpf_token *bpf_token_get_from_fd(u32 ufd); - -bool 
bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); -bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); -bool bpf_token_allow_prog_type(const struct bpf_token *token, - enum bpf_prog_type prog_type, - enum bpf_attach_type attach_type); - int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, - umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ @@ -2526,8 +2480,7 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, - const struct bpf_prog *prog); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2646,24 +2599,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline bool bpf_token_capable(const struct bpf_token *token, int cap) -{ - return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); -} - -static inline void bpf_token_inc(struct bpf_token *token) -{ -} - -static inline void bpf_token_put(struct bpf_token *token) -{ -} - -static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline void __dev_flush(void) { } @@ -2787,7 +2722,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } diff 
--git a/include/linux/filter.h b/include/linux/filter.h index 12d907f17d364..68fb6c8142fec 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 3fdd00b452aca..ff217a5ce5521 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,17 +398,10 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, - struct path *path) -LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) -LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) -LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) +LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h 
b/include/linux/security.h index 00809d2d5c38c..1d1df326c881c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,7 +32,6 @@ #include #include #include -#include struct linux_binprm; struct cred; @@ -2021,22 +2020,15 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_token; +struct bpf_prog_aux; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token); +extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token); -extern void security_bpf_prog_free(struct bpf_prog *prog); -extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path); -extern void security_bpf_token_free(struct bpf_token *token); -extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); -extern int security_bpf_token_capable(const struct bpf_token *token, int cap); +extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); +extern void security_bpf_prog_free(struct bpf_prog_aux *aux); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,8 +2046,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) +static inline int security_bpf_map_alloc(struct bpf_map *map) { return 0; } @@ -2063,33 +2054,13 @@ static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *a static inline void security_bpf_map_free(struct bpf_map *map) { } 
-static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) +static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog *prog) +static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) { } - -static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path) -{ - return 0; -} - -static inline void security_bpf_token_free(struct bpf_token *token) -{ } - -static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) -{ - return 0; -} - -static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) -{ - return 0; -} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From fd27ef6b44bec26915c5b2b22c13856d9f0ba17a Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 19 Dec 2023 11:32:40 +0200 Subject: virtio-pci: Introduce admin virtqueue Introduce support for the admin virtqueue. By negotiating VIRTIO_F_ADMIN_VQ feature, driver detects capability and creates one administration virtqueue. Administration virtqueue implementation in virtio pci generic layer, enables multiple types of upper layer drivers such as vfio, net, blk to utilize it. Signed-off-by: Feng Liu Reviewed-by: Parav Pandit Reviewed-by: Jiri Pirko Acked-by: Michael S. 
Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/virtio_config.h | 4 ++++ include/linux/virtio_pci_modern.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 2b3438de2c4d4..da9b271b54db8 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -93,6 +93,8 @@ typedef void vq_callback_t(struct virtqueue *); * Returns 0 on success or error status * If disable_vq_and_reset is set, then enable_vq_after_reset must also be * set. + * @create_avq: create admin virtqueue resource. + * @destroy_avq: destroy admin virtqueue resource. */ struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, @@ -120,6 +122,8 @@ struct virtio_config_ops { struct virtio_shm_region *region, u8 id); int (*disable_vq_and_reset)(struct virtqueue *vq); int (*enable_vq_after_reset)(struct virtqueue *vq); + int (*create_avq)(struct virtio_device *vdev); + void (*destroy_avq)(struct virtio_device *vdev); }; /* If driver didn't advertise the feature, it will never appear. 
*/ diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index a09e13a577a99..c0b1b1ca11635 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -125,4 +125,6 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); int vp_modern_get_queue_reset(struct virtio_pci_modern_device *mdev, u16 index); void vp_modern_set_queue_reset(struct virtio_pci_modern_device *mdev, u16 index); +u16 vp_modern_avq_num(struct virtio_pci_modern_device *mdev); +u16 vp_modern_avq_index(struct virtio_pci_modern_device *mdev); #endif -- cgit v1.2.3 From 92792ac752aa80d5ee71bc291d90edd06cd76bd1 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Tue, 19 Dec 2023 11:32:41 +0200 Subject: virtio-pci: Introduce admin command sending function Add support for sending admin command through admin virtqueue interface. Abort any inflight admin commands once device reset completes. Activate admin queue when device becomes ready; deactivate on device reset. To comply to the below specification statement [1], the admin virtqueue is activated for upper layer users only after setting DRIVER_OK status. [1] The driver MUST NOT send any buffer available notifications to the device before setting DRIVER_OK. Signed-off-by: Feng Liu Reviewed-by: Parav Pandit Acked-by: Michael S. 
Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/virtio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 4cc614a383765..b0201747a263a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -103,6 +103,14 @@ int virtqueue_resize(struct virtqueue *vq, u32 num, int virtqueue_reset(struct virtqueue *vq, void (*recycle)(struct virtqueue *vq, void *buf)); +struct virtio_admin_cmd { + __le16 opcode; + __le16 group_type; + __le64 group_member_id; + struct scatterlist *data_sg; + struct scatterlist *result_sg; +}; + /** * struct virtio_device - representation of a device using virtio * @index: unique position on the virtio bus -- cgit v1.2.3 From c3fc3e098bd64c560dde49a6e72b21b055150abe Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:44 +0200 Subject: virtio-pci: Introduce APIs to execute legacy IO admin commands Introduce APIs to execute legacy IO admin commands. It includes: io_legacy_read/write for both common and the device configuration, io_legacy_notify_info. In addition, exposing an API to check whether the legacy IO commands are supported. (i.e. virtio_pci_admin_has_legacy_io()). Those APIs will be used by the next patches from this series. Note: Unlike modern drivers which support hardware virtio devices, legacy drivers assume software-based devices: e.g. they don't use proper memory barriers on ARM, use big endian on PPC, etc. X86 drivers are mostly ok though, more or less by chance. For now, only support legacy IO on X86. Acked-by: Michael S. 
Tsirkin Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20231219093247.170936-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/virtio_pci_admin.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/linux/virtio_pci_admin.h (limited to 'include/linux') diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h new file mode 100644 index 0000000000000..f4a100a0fe2e1 --- /dev/null +++ b/include/linux/virtio_pci_admin.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VIRTIO_PCI_ADMIN_H +#define _LINUX_VIRTIO_PCI_ADMIN_H + +#include +#include + +#ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY +bool virtio_pci_admin_has_legacy_io(struct pci_dev *pdev); +int virtio_pci_admin_legacy_common_io_write(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_common_io_read(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_device_io_write(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_device_io_read(struct pci_dev *pdev, u8 offset, + u8 size, u8 *buf); +int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev, + u8 req_bar_flags, u8 *bar, + u64 *bar_offset); +#endif + +#endif /* _LINUX_VIRTIO_PCI_ADMIN_H */ -- cgit v1.2.3 From 8bccc5b80678c69f7729ce4cd232c0aa98fa6277 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:45 +0200 Subject: vfio/pci: Expose vfio_pci_core_setup_barmap() Expose vfio_pci_core_setup_barmap() to be used by drivers. This will let drivers to mmap a BAR and re-use it from both vfio and the driver when it's applicable. This API will be used in the next patches by the vfio/virtio coming driver. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yishai Hadas Acked-by: Michael S. 
Tsirkin Link: https://lore.kernel.org/r/20231219093247.170936-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 562e8754869da..67ac58e20e1da 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -127,6 +127,7 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); +int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -- cgit v1.2.3 From 8486ae162b3b6cc1055366f044495cf1966231f1 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 Dec 2023 11:32:46 +0200 Subject: vfio/pci: Expose vfio_pci_core_iowrite/read##size() Expose vfio_pci_core_iowrite/read##size() to let it be used by drivers. This functionality is needed to enable direct access to some physical BAR of the device with the proper locks/checks in place. The next patches from this series will use this functionality on a data path flow when a direct access to the BAR is needed. Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yishai Hadas Acked-by: Michael S. 
Tsirkin Link: https://lore.kernel.org/r/20231219093247.170936-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 67ac58e20e1da..85e84b92751b6 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -131,4 +131,23 @@ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); +#define VFIO_IOWRITE_DECLATION(size) \ +int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ + bool test_mem, u##size val, void __iomem *io); + +VFIO_IOWRITE_DECLATION(8) +VFIO_IOWRITE_DECLATION(16) +VFIO_IOWRITE_DECLATION(32) +#ifdef iowrite64 +VFIO_IOWRITE_DECLATION(64) +#endif + +#define VFIO_IOREAD_DECLATION(size) \ +int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ + bool test_mem, u##size *val, void __iomem *io); + +VFIO_IOREAD_DECLATION(8) +VFIO_IOREAD_DECLATION(16) +VFIO_IOREAD_DECLATION(32) + #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From a4104821ad651d8a0b374f0b2474c345bbb42f82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Dec 2023 12:30:43 -0700 Subject: io_uring/unix: drop usage of io_uring socket Since we no longer allow sending io_uring fds over SCM_RIGHTS, move to using io_is_uring_fops() to detect whether this is a io_uring fd or not. With that done, kill off io_uring_get_socket() as nobody calls it anymore. This is in preparation to yanking out the rest of the core related to unix gc with io_uring. 
Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index d8fc93492dc50..68ed6697fece3 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -7,12 +7,12 @@ #include #if defined(CONFIG_IO_URING) -struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); +bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { @@ -32,10 +32,6 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else -static inline struct sock *io_uring_get_socket(struct file *file) -{ - return NULL; -} static inline void io_uring_task_cancel(void) { } @@ -54,6 +50,10 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } +static inline bool io_is_uring_fops(struct file *file) +{ + return false; +} #endif #endif -- cgit v1.2.3 From 6e5e6d274956305f1fc0340522b38f5f5be74bdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Dec 2023 12:36:34 -0700 Subject: io_uring: drop any code related to SCM_RIGHTS This is dead code after we dropped support for passing io_uring fds over SCM_RIGHTS, get rid of it. 
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index bebab36abce89..fc8f2570b92b3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -389,9 +389,6 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif /* hashed buffered write serialization */ struct io_wq_hash *hash_map; -- cgit v1.2.3 From ee0cf5e07f44a10fce8f1bfa9db226c0b5ecf880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Mon, 18 Dec 2023 18:14:16 +0100 Subject: clk: fixed-rate: fix clk_hw_register_fixed_rate_with_accuracy_parent_hw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing comma and remove extraneous NULL argument. The macro is currently used by no one which explains why the typo slipped by. 
Fixes: 2d34f09e79c9 ("clk: fixed-rate: Add support for specifying parents via DT/pointers") Signed-off-by: Théo Lebrun Link: https://lore.kernel.org/r/20231218-mbly-clk-v1-1-44ce54108f06@bootlin.com Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index ace3a4ce2fc98..1293c38ddb7f7 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -448,8 +448,8 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, */ #define clk_hw_register_fixed_rate_with_accuracy_parent_hw(dev, name, \ parent_hw, flags, fixed_rate, fixed_accuracy) \ - __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw) \ - NULL, NULL, (flags), (fixed_rate), \ + __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, (parent_hw), \ + NULL, (flags), (fixed_rate), \ (fixed_accuracy), 0, false) /** * clk_hw_register_fixed_rate_with_accuracy_parent_data - register fixed-rate -- cgit v1.2.3 From 4ba1d0f23414135e4f426dae4cb5cdc2ce246f89 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:25 -0800 Subject: bpf: abstract away global subprog arg preparation logic from reg state setup btf_prepare_func_args() is used to understand expectations and restrictions on global subprog arguments. But current implementation is hard to extend, as it intermixes BTF-based func prototype parsing and interpretation logic with setting up register state at subprog entry. Worse still, those registers are not completely set up inside btf_prepare_func_args(), requiring some more logic later in do_check_common(). Like calling mark_reg_unknown() and similar initialization operations. This intermixing of BTF interpretation and register state setup is problematic. 
First, it causes duplication of BTF parsing logic for global subprog verification (to set up initial state of global subprog) and global subprog call sites analysis (when we need to check that whatever is being passed into global subprog matches expectations), performed in btf_check_subprog_call(). Given we want to extend global func argument with tags later, this duplication is problematic. So refactor btf_prepare_func_args() to do only BTF-based func proto and args parsing, returning high-level argument "expectations" only, with no regard to specifics of register state. I.e., if it's a context argument, instead of setting register state to PTR_TO_CTX, we return ARG_PTR_TO_CTX enum for that argument as "an argument specification" for further processing inside do_check_common(). Similarly for SCALAR arguments, PTR_TO_MEM, etc. This allows to reuse btf_prepare_func_args() in following patches at global subprog call site analysis time. It also keeps register setup code consistently in one place, do_check_common(). Besides all this, we cache this argument specs information inside env->subprog_info, eliminating the need to redo these potentially expensive BTF traversals, especially if BPF program's BTF is big and/or there are lots of global subprog calls. 
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +-- include/linux/bpf_verifier.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a8d4c81a39a1..c050c82cc9a5f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2470,8 +2470,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); -int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, u32 *nargs); +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c2819a6579a52..5742e9c0a7b86 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -606,6 +606,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 +struct bpf_subprog_arg_info { + enum bpf_arg_type arg_type; + union { + u32 mem_size; + }; +}; + struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ @@ -617,6 +624,10 @@ struct bpf_subprog_info { bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; + bool args_cached: 1; + + u8 arg_cnt; + struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; struct bpf_verifier_env; @@ -727,6 +738,11 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; }; +static inline 
struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) +{ + return &env->subprog_info[subprog]; +} + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, -- cgit v1.2.3 From 5eccd2db42d77e3570619c32d39e39bf486607cf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:26 -0800 Subject: bpf: reuse btf_prepare_func_args() check for main program BTF validation Instead of btf_check_subprog_arg_match(), use btf_prepare_func_args() logic to validate "trustworthiness" of main BPF program's BTF information, if it is present. We ignored results of original BTF check anyway, often times producing confusing and ominously-sounding "reg type unsupported for arg#0 function" message, which has no apparent effect on program correctness and verification process. All the -EFAULT returning sanity checks are already performed in check_btf_info_early(), so there is zero reason to have this duplication of logic between btf_check_subprog_call() and btf_check_subprog_arg_match(). Dropping btf_check_subprog_arg_match() simplifies btf_check_func_arg_match() further removing `bool processing_call` flag. One subtle bit that was done by btf_check_subprog_arg_match() was potentially marking main program's BTF as unreliable. We do this explicitly now with a dedicated simple check, preserving the original behavior, but now based on well factored btf_prepare_func_args() logic. 
Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c050c82cc9a5f..d0d7eff22b8a9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); -- cgit v1.2.3 From e26080d0da87f20222ca6712b65f95a856fadee0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:27 -0800 Subject: bpf: prepare btf_prepare_func_args() for handling static subprogs Generalize btf_prepare_func_args() to support both global and static subprogs. We are going to utilize this property in the next patch, reusing btf_prepare_func_args() for subprog call logic instead of reparsing BTF information in a completely separate implementation. btf_prepare_func_args() now detects whether subprog is global or static and makes slight logic adjustments for static func cases, like not failing fatally (-EFAULT) for conditions that are allowable for static subprogs. Somewhat subtle (but major!) difference is the handling of pointer arguments. Both global and static functions need to handle special context arguments (which are pointers to predefined type names), but static subprogs give up on any other pointers, falling back to marking subprog as "unreliable", disabling the use of BTF type information altogether. 
For global functions, though, we are assuming that such pointers to unrecognized types are just pointers to a fixed-sized memory region (or error out if size cannot be established, like for `void *` pointers). This patch accommodates these small differences and sets up a stage for refactoring in the next patch, eliminating a separate BTF-based parsing logic in btf_check_func_arg_match(). Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5742e9c0a7b86..d3ea9ef047677 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -738,6 +738,11 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; }; +static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) { return &env->subprog_info[subprog]; -- cgit v1.2.3 From c5a7244759b1eeacc59d0426fb73859afa942d0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:28 -0800 Subject: bpf: move subprog call logic back to verifier.c Subprog call logic in btf_check_subprog_call() currently has both a lot of BTF parsing logic (which is, presumably, what justified putting it into btf.c), but also a bunch of register state checks, some of which utilize deep verifier logic helpers, necessarily exported from verifier.c: check_ptr_off_reg(), check_func_arg_reg_off(), and check_mem_reg(). Going forward, btf_check_subprog_call() will have a minimum of BTF-related logic, but will get more internal verifier logic related to register state manipulation. 
So move it into verifier.c to minimize amount of verifier-specific logic exposed to btf.c. We do this move before refactoring btf_check_func_arg_match() to preserve as much history post-refactoring as possible. No functional changes. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- include/linux/bpf_verifier.h | 8 -------- 2 files changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0d7eff22b8a9..7671530d6e4e0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d3ea9ef047677..d07d857ca67fe 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -785,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); -int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type); -int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size); - /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 
btf_id) -- cgit v1.2.3 From 7437bb73f087e5f216f9c6603f5149d354e315af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Dec 2023 17:53:57 +0100 Subject: block: remove support for the host aware zone model When zones were first added to the SCSI and ATA specs, two different models were supported (in addition to the drive managed one that is invisible to the host): - host managed where non-conventional zones there is strict requirement to write at the write pointer, or else an error is returned - host aware where a write point is maintained if writes always happen at it, otherwise it is left in an under-defined state and the sequential write preferred zones behave like conventional zones (probably very badly performing ones, though) Not surprisingly this lukewarm model didn't prove to be very useful and was finally removed from the ZBC and SBC specs (NVMe never implemented it). Due to the easily disappearing write pointer host software could never rely on the write pointer to actually be useful for say recovery. Fortunately only a few HDD prototypes shipped using this model which never made it to mass production. Drop the support before it is too late. Note that any such host aware prototype HDD can still be used with Linux as we'll now treat it as a conventional HDD. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20231217165359.604246-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 37 +++---------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 185ed3770e3a9..28cda9fb239eb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -263,18 +263,6 @@ static inline bool blk_op_is_passthrough(blk_opf_t op) return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } -/* - * Zoned block device models (zoned limit). 
- * - * Note: This needs to be ordered from the least to the most severe - * restrictions for the inheritance in blk_stack_limits() to work. - */ -enum blk_zoned_model { - BLK_ZONED_NONE = 0, /* Regular block device */ - BLK_ZONED_HA, /* Host-aware zoned block device */ - BLK_ZONED_HM, /* Host-managed zoned block device */ -}; - /* * BLK_BOUNCE_NONE: never bounce (default) * BLK_BOUNCE_HIGH: bounce all highmem pages @@ -316,7 +304,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; - enum blk_zoned_model zoned; + bool zoned; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -329,7 +317,7 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model); +void disk_set_zoned(struct gendisk *disk, bool zoned); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, @@ -617,23 +605,9 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q) } #endif -static inline enum blk_zoned_model -blk_queue_zoned_model(struct request_queue *q) -{ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) - return q->limits.zoned; - return BLK_ZONED_NONE; -} - static inline bool blk_queue_is_zoned(struct request_queue *q) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - case BLK_ZONED_HM: - return true; - default: - return false; - } + return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned; } #ifdef CONFIG_BLK_DEV_ZONED @@ -1260,11 +1234,6 @@ static inline bool bdev_nowait(struct block_device *bdev) return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); } -static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) -{ - return blk_queue_zoned_model(bdev_get_queue(bdev)); -} - static inline bool bdev_is_zoned(struct block_device *bdev) { return 
blk_queue_is_zoned(bdev_get_queue(bdev)); -- cgit v1.2.3 From d73e93b4dfab10c80688b061c30048df05585c7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 17 Dec 2023 17:53:58 +0100 Subject: block: simplify disk_set_zoned Only use disk_set_zoned to actually enable zoned device support. For clearing it, call disk_clear_zoned, which is renamed from disk_clear_zone_settings and now directly clears the zoned flag as well. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20231217165359.604246-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 28cda9fb239eb..bc236e77d85e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -317,7 +317,8 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); -void disk_set_zoned(struct gendisk *disk, bool zoned); +void disk_set_zoned(struct gendisk *disk); +void disk_clear_zoned(struct gendisk *disk); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, -- cgit v1.2.3 From 8be0c877fb3b671dac0cf56d1f1f9e65f9a9fb81 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 16:43:05 +0100 Subject: thunderbolt: make tb_bus_type const Now that the driver core can properly handle constant struct bus_type, move the tb_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Cc: Andreas Noever Cc: Michael Jamet Cc: Yehezkel Bernat Cc: Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/2023121904-utopia-broadcast-06d1@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/thunderbolt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 6151c210d987d..2c835e5c41f63 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -86,7 +86,7 @@ struct tb { unsigned long privdata[]; }; -extern struct bus_type tb_bus_type; +extern const struct bus_type tb_bus_type; extern struct device_type tb_service_type; extern struct device_type tb_xdomain_type; -- cgit v1.2.3 From 1a36e0f50f963465e9b2b980d250ab38b8fcd7a3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 Dec 2023 10:32:38 +0200 Subject: net: Add MDB bulk deletion device operation Add MDB net device operation that will be invoked by rtnetlink code in response to received 'RTM_DELMDB' messages with the 'NLM_F_BULK' flag set. Subsequent patches will implement the operation in the bridge and VXLAN drivers. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b935ee341b42..75c7725e5e4fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1329,6 +1329,9 @@ struct netdev_net_notifier { * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. + * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], + * struct netlink_ext_ack *extack); + * Bulk deletes MDB entries from dev. * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. 
The first argument (marker) in the netlink @@ -1611,6 +1614,9 @@ struct net_device_ops { int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); + int (*ndo_mdb_del_bulk)(struct net_device *dev, + struct nlattr *tb[], + struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); -- cgit v1.2.3 From 40ca4ee3136d2d09977d1cab8c0c0e1582c3359d Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 12 Dec 2023 06:12:43 -0500 Subject: evm: don't copy up 'security.evm' xattr The security.evm HMAC and the original file signatures contain filesystem specific data. As a result, the HMAC and signature are not the same on the stacked and backing filesystems. Don't copy up 'security.evm'. Reviewed-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Mimi Zohar --- include/linux/evm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 01fc495a83e27..36ec884320d9f 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -31,6 +31,7 @@ extern void evm_inode_post_setxattr(struct dentry *dentry, const char *xattr_name, const void *xattr_value, size_t xattr_value_len); +extern int evm_inode_copy_up_xattr(const char *name); extern int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name); extern void evm_inode_post_removexattr(struct dentry *dentry, @@ -117,6 +118,11 @@ static inline void evm_inode_post_setxattr(struct dentry *dentry, return; } +static inline int evm_inode_copy_up_xattr(const char *name) +{ + return 0; +} + static inline int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *xattr_name) -- cgit v1.2.3 From cd708c938f055c9eb5a366ec1c8edcefa28afc28 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Mon, 18 Dec 2023 08:06:40 -0500 Subject: evm: add support to disable EVM on unsupported filesystems 
Identify EVM unsupported filesystems by defining a new flag SB_I_EVM_UNSUPPORTED. Don't verify, write, remove or update 'security.evm' on unsupported filesystems. Acked-by: Amir Goldstein Reviewed-by: Christian Brauner Signed-off-by: Mimi Zohar --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..1474f36e9b38b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1164,6 +1164,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ -- cgit v1.2.3 From 139f84002145d8624f0195fb090b3a7670744a13 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:16 -0500 Subject: ring-buffer: Page size per ring buffer Currently the size of one sub buffer page is global for all buffers and it is hard coded to one system page. In order to introduce configurable ring buffer sub page size, the internal logic should be refactored to work with sub page size per ring buffer. 
Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-3-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.009147038@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b1b03b2c0f08e..ce46218ce46df 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -200,7 +200,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); -int ring_buffer_print_page_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, -- cgit v1.2.3 From 2808e31ec12e5fbe2ae25acc027fcdc67b1fb7f0 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:17 -0500 Subject: ring-buffer: Add interface for configuring trace sub buffer size The trace ring buffer sub page size can be configured, per trace instance. A new ftrace file "buffer_subbuf_order" is added to get and set the size of the ring buffer sub page for current trace instance. The size must be an order of system page size, that's why the new interface works with system page order, instead of absolute page size: 0 means the ring buffer sub page is equal to 1 system page and so forth: 0 - 1 system page 1 - 2 system pages 2 - 4 system pages ... The ring buffer sub page size is limited between 1 and 128 system pages. The default value is 1 system page. 
New ring buffer APIs are introduced: ring_buffer_subbuf_order_set() ring_buffer_subbuf_order_get() ring_buffer_subbuf_size_get() Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-4-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.298324722@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index ce46218ce46df..12573306b8892 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -202,6 +202,10 @@ struct trace_seq; int ring_buffer_print_entry_header(struct trace_seq *s); int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s); +int ring_buffer_subbuf_order_get(struct trace_buffer *buffer); +int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order); +int ring_buffer_subbuf_size_get(struct trace_buffer *buffer); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; -- cgit v1.2.3 From bce761d757452ba5eb77e11fecc37a04b67494e7 Mon Sep 17 00:00:00 2001 From: "Tzvetomir Stoyanov (VMware)" Date: Tue, 19 Dec 2023 13:54:19 -0500 Subject: ring-buffer: Read and write to ring buffers with custom sub buffer size As the size of the ring sub buffer page can be changed dynamically, the logic that reads and writes to the buffer should be fixed to take that into account. 
Some internal ring buffer APIs are changed: ring_buffer_alloc_read_page() ring_buffer_free_read_page() ring_buffer_read_page() A new API is introduced: ring_buffer_read_page_data() Link: https://lore.kernel.org/linux-trace-devel/20211213094825.61876-6-tz.stoyanov@gmail.com Link: https://lore.kernel.org/linux-trace-kernel/20231219185628.875145995@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Kent Overstreet Signed-off-by: Tzvetomir Stoyanov (VMware) [ Fixed kerneldoc on data_page parameter in ring_buffer_free_read_page() ] Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 12573306b8892..fa802db216f94 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -192,10 +192,15 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer); size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu); size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu); -void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data); -int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page, +struct buffer_data_read_page; +struct buffer_data_read_page * +ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu); +void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, + struct buffer_data_read_page *page); +int ring_buffer_read_page(struct trace_buffer *buffer, + struct buffer_data_read_page *data_page, size_t len, int cpu, int full); +void *ring_buffer_read_page_data(struct buffer_data_read_page *page); struct trace_seq; -- cgit v1.2.3 From f3e25e68ceb2abaeefcac8f930c940c4494705d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 
14 Nov 2023 12:20:11 +0100 Subject: pwm: Drop unused member "pwm" from struct pwm_device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This member is only assigned to and never read. So drop it. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index cda3597b84f2c..8cadf9ee8d265 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -79,7 +79,6 @@ struct pwm_device { const char *label; unsigned long flags; unsigned int hwpwm; - unsigned int pwm; struct pwm_chip *chip; struct pwm_args args; -- cgit v1.2.3 From 54c86dd20bba23109e32e4e2f94ff93dd9863bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 14 Nov 2023 12:20:12 +0100 Subject: pwm: Replace PWM chip unique base by unique ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Traditionally each PWM device had a unique ID stored in the "pwm" member of struct pwm_device. However this number was hardly used and dropped in the previous commit. To identify a certain PWM you're supposed to use the chip's ID and the hwpwm of the PWM device now. With the PWM chip base gone PWM chips can get their IDs better and simpler using an idr. This is expected to change the numbering of PWM chips, but nothing should rely on the numbering anyhow. Other than that the side effects are: - The PWM chip IDs are smaller and in most cases consecutive. - The ordering in /sys/kernel/debug/pwm is ordered by ascending PWM chip ID. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8cadf9ee8d265..c27a4bb76012e 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -292,7 +292,7 @@ struct pwm_chip { struct device *dev; const struct pwm_ops *ops; struct module *owner; - int base; + unsigned int id; unsigned int npwm; struct pwm_device * (*of_xlate)(struct pwm_chip *chip, @@ -300,7 +300,6 @@ struct pwm_chip { unsigned int of_pwm_n_cells; /* only used internally by the PWM framework */ - struct list_head list; struct pwm_device *pwms; }; -- cgit v1.2.3 From 2d91123ae5614b8737abd3a519b81265309a1ac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 29 Nov 2023 11:18:32 +0100 Subject: pwm: Update kernel doc for struct pwm_chip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c572f3b9c8b7 ("pwm: Replace PWM chip unique base by unique ID") changed the members of struct pwm_chip, but failed to update the documentation accordingly. Catch up and document the new member and drop description for the two removed ones. 
Reported-by: Stephen Rothwell Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index c27a4bb76012e..f87655c06c825 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -281,11 +281,10 @@ struct pwm_ops { * @dev: device providing the PWMs * @ops: callbacks for this PWM controller * @owner: module providing this chip - * @base: number of first PWM controlled by this chip + * @id: unique number of this PWM chip * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier - * @list: list node for internal use * @pwms: array of PWM devices allocated by the framework */ struct pwm_chip { -- cgit v1.2.3 From c748a6d77c06a78651030e17da6beb278a1c9470 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Dec 2023 16:30:24 +0000 Subject: pwm: Rename pwm_apply_state() to pwm_apply_might_sleep() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to introduce a pwm api which can be used from atomic context, we will need two functions for applying pwm changes: int pwm_apply_might_sleep(struct pwm *, struct pwm_state *); int pwm_apply_atomic(struct pwm *, struct pwm_state *); This commit just deals with renaming pwm_apply_state(), a following commit will introduce the pwm_apply_atomic() function. 
Acked-by: Uwe Kleine-König Acked-by: Guenter Roeck Acked-by: Mark Brown Acked-by: Dmitry Torokhov # for input Acked-by: Hans de Goede Acked-by: Jani Nikula Acked-by: Lee Jones Signed-off-by: Sean Young Signed-off-by: Thierry Reding --- include/linux/pwm.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index f87655c06c825..b64b8a82415c4 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -92,8 +92,8 @@ struct pwm_device { * @state: state to fill with the current PWM state * * The returned PWM state represents the state that was applied by a previous call to - * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to - * hardware. If pwm_apply_state() was never called, this returns either the current hardware + * pwm_apply_might_sleep(). Drivers may have to slightly tweak that state before programming it to + * hardware. If pwm_apply_might_sleep() was never called, this returns either the current hardware * state (if supported) or the default settings. */ static inline void pwm_get_state(const struct pwm_device *pwm, @@ -157,20 +157,20 @@ static inline void pwm_get_args(const struct pwm_device *pwm, } /** - * pwm_init_state() - prepare a new state to be applied with pwm_apply_state() + * pwm_init_state() - prepare a new state to be applied with pwm_apply_might_sleep() * @pwm: PWM device * @state: state to fill with the prepared PWM state * * This functions prepares a state that can later be tweaked and applied - * to the PWM device with pwm_apply_state(). This is a convenient function + * to the PWM device with pwm_apply_might_sleep(). This is a convenient function * that first retrieves the current PWM state and the replaces the period * and polarity fields with the reference values defined in pwm->args. 
* Once the function returns, you can adjust the ->enabled and ->duty_cycle - * fields according to your needs before calling pwm_apply_state(). + * fields according to your needs before calling pwm_apply_might_sleep(). * * ->duty_cycle is initially set to zero to avoid cases where the current * ->duty_cycle value exceed the pwm_args->period one, which would trigger - * an error if the user calls pwm_apply_state() without adjusting ->duty_cycle + * an error if the user calls pwm_apply_might_sleep() without adjusting ->duty_cycle * first. */ static inline void pwm_init_state(const struct pwm_device *pwm, @@ -226,7 +226,7 @@ pwm_get_relative_duty_cycle(const struct pwm_state *state, unsigned int scale) * * pwm_init_state(pwm, &state); * pwm_set_relative_duty_cycle(&state, 50, 100); - * pwm_apply_state(pwm, &state); + * pwm_apply_might_sleep(pwm, &state); * * This functions returns -EINVAL if @duty_cycle and/or @scale are * inconsistent (@scale == 0 or @duty_cycle > @scale). @@ -304,7 +304,7 @@ struct pwm_chip { #if IS_ENABLED(CONFIG_PWM) /* PWM user APIs */ -int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state); +int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state); int pwm_adjust_config(struct pwm_device *pwm); /** @@ -332,7 +332,7 @@ static inline int pwm_config(struct pwm_device *pwm, int duty_ns, state.duty_cycle = duty_ns; state.period = period_ns; - return pwm_apply_state(pwm, &state); + return pwm_apply_might_sleep(pwm, &state); } /** @@ -353,7 +353,7 @@ static inline int pwm_enable(struct pwm_device *pwm) return 0; state.enabled = true; - return pwm_apply_state(pwm, &state); + return pwm_apply_might_sleep(pwm, &state); } /** @@ -372,7 +372,7 @@ static inline void pwm_disable(struct pwm_device *pwm) return; state.enabled = false; - pwm_apply_state(pwm, &state); + pwm_apply_might_sleep(pwm, &state); } /* PWM provider APIs */ @@ -403,8 +403,8 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev, struct 
fwnode_handle *fwnode, const char *con_id); #else -static inline int pwm_apply_state(struct pwm_device *pwm, - const struct pwm_state *state) +static inline int pwm_apply_might_sleep(struct pwm_device *pwm, + const struct pwm_state *state) { might_sleep(); return -ENOTSUPP; @@ -521,7 +521,7 @@ static inline void pwm_apply_args(struct pwm_device *pwm) state.period = pwm->args.period; state.usage_power = false; - pwm_apply_state(pwm, &state); + pwm_apply_might_sleep(pwm, &state); } struct pwm_lookup { -- cgit v1.2.3 From dc518b378dced419baa95d76a85f4c8c405722bc Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Dec 2023 16:30:25 +0000 Subject: pwm: Replace ENOTSUPP with EOPNOTSUPP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to Documentation/dev-tools/checkpatch.rst ENOTSUPP is not recommended and EOPNOTSUPP should be used instead. Signed-off-by: Sean Young Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index b64b8a82415c4..c9cb87b59ac80 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -407,12 +407,12 @@ static inline int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state) { might_sleep(); - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int pwm_adjust_config(struct pwm_device *pwm) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int pwm_config(struct pwm_device *pwm, int duty_ns, -- cgit v1.2.3 From 7170d3beafc2373dd76b6b5d6e617d89e4e42b8b Mon Sep 17 00:00:00 2001 From: Sean Young Date: Tue, 19 Dec 2023 16:30:27 +0000 Subject: pwm: Make it possible to apply PWM changes in atomic context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PWM devices require sleeping, for example if the pwm device is connected over I2C. 
However, many PWM devices could be used from atomic context, e.g. memory mapped PWM. This is useful for, for example, the pwm-ir-tx driver which requires precise timing. Sleeping causes havoc with the generated IR signal. Since not all PWM devices can support atomic context, we also add a pwm_might_sleep() function to check if it is not supported. Signed-off-by: Sean Young Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index c9cb87b59ac80..495af3627939c 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -285,6 +285,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @of_pwm_n_cells: number of cells expected in the device tree PWM specifier + * @atomic: can the driver's ->apply() be called in atomic context * @pwms: array of PWM devices allocated by the framework */ struct pwm_chip { @@ -297,6 +298,7 @@ struct pwm_chip { struct pwm_device * (*of_xlate)(struct pwm_chip *chip, const struct of_phandle_args *args); unsigned int of_pwm_n_cells; + bool atomic; /* only used internally by the PWM framework */ struct pwm_device *pwms; @@ -305,6 +307,7 @@ struct pwm_chip { #if IS_ENABLED(CONFIG_PWM) /* PWM user APIs */ int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state); +int pwm_apply_atomic(struct pwm_device *pwm, const struct pwm_state *state); int pwm_adjust_config(struct pwm_device *pwm); /** @@ -375,6 +378,17 @@ static inline void pwm_disable(struct pwm_device *pwm) pwm_apply_might_sleep(pwm, &state); } +/** + * pwm_might_sleep() - is pwm_apply_atomic() supported? + * @pwm: PWM device + * + * Returns: false if pwm_apply_atomic() can be called from atomic context. 
+ */ +static inline bool pwm_might_sleep(struct pwm_device *pwm) +{ + return !pwm->chip->atomic; +} + /* PWM provider APIs */ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result, unsigned long timeout); @@ -403,6 +417,11 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, const char *con_id); #else +static inline bool pwm_might_sleep(struct pwm_device *pwm) +{ + return true; +} + static inline int pwm_apply_might_sleep(struct pwm_device *pwm, const struct pwm_state *state) { @@ -410,6 +429,12 @@ static inline int pwm_apply_might_sleep(struct pwm_device *pwm, return -EOPNOTSUPP; } +static inline int pwm_apply_atomic(struct pwm_device *pwm, + const struct pwm_state *state) +{ + return -EOPNOTSUPP; +} + static inline int pwm_adjust_config(struct pwm_device *pwm) { return -EOPNOTSUPP; -- cgit v1.2.3 From fe22944cf05ede8e6f841cfecdb7093a53a3d9b3 Mon Sep 17 00:00:00 2001 From: xiaoming Wang Date: Tue, 19 Dec 2023 11:34:11 +0800 Subject: cpu/hotplug: Increase the number of dynamic states The dynamically allocatable hotplug state space can be exhausted by the existing drivers and infrastructure which install CPU hotplug states dynamically. That prevents new drivers and infrastructure from installing dynamically allocated states. Increase the size of the CPUHP_AP_ONLINE_DYN state by 10 to make room. 
Signed-off-by: Xiaoming Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231219033411.816100-1-xiaoming.wang@intel.com --- include/linux/cpuhotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index af6c21aab9859..8bd454dfe453c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -239,7 +239,7 @@ enum cpuhp_state { CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, - CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, -- cgit v1.2.3 From 80e4a9987999e682366d60f43a7b2adefc48e222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 10 Dec 2023 00:00:47 +0100 Subject: pwm: Drop two unused API functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions are unused. Also I think there is no valid use case where these are correct to be called. So drop them. 
Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 495af3627939c..5dd665d8c909e 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -111,12 +111,6 @@ static inline bool pwm_is_enabled(const struct pwm_device *pwm) return state.enabled; } -static inline void pwm_set_period(struct pwm_device *pwm, u64 period) -{ - if (pwm) - pwm->state.period = period; -} - static inline u64 pwm_get_period(const struct pwm_device *pwm) { struct pwm_state state; @@ -126,12 +120,6 @@ static inline u64 pwm_get_period(const struct pwm_device *pwm) return state.period; } -static inline void pwm_set_duty_cycle(struct pwm_device *pwm, unsigned int duty) -{ - if (pwm) - pwm->state.duty_cycle = duty; -} - static inline u64 pwm_get_duty_cycle(const struct pwm_device *pwm) { struct pwm_state state; -- cgit v1.2.3 From eba2591d99d1f14a04c8a8a845ab0795b93f5646 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 13 Dec 2023 21:29:59 +0100 Subject: mm: Introduce pudp/p4dp/pgdp_get() functions Instead of directly dereferencing page table entries, which can cause issues (see commit 20a004e7b017 ("arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing page tables")), let's introduce new functions to get the pud/p4d/pgd entries (the pte and pmd versions already exist). Note that arm pgd_t is actually an array so pgdp_get() is defined as a macro to avoid a build error. Those new functions will be used in subsequent commits by the riscv architecture. 
Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20231213203001.179237-3-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- include/linux/pgtable.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index af7639c3b0a3a..8b7daccd11bef 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -292,6 +292,27 @@ static inline pmd_t pmdp_get(pmd_t *pmdp) } #endif +#ifndef pudp_get +static inline pud_t pudp_get(pud_t *pudp) +{ + return READ_ONCE(*pudp); +} +#endif + +#ifndef p4dp_get +static inline p4d_t p4dp_get(p4d_t *p4dp) +{ + return READ_ONCE(*p4dp); +} +#endif + +#ifndef pgdp_get +static inline pgd_t pgdp_get(pgd_t *pgdp) +{ + return READ_ONCE(*pgdp); +} +#endif + #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, -- cgit v1.2.3 From 7d7ef0a4686abe43cd76a141b340a348f45ecdf2 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 29 Nov 2023 03:21:53 +0000 Subject: mm: memcg: restore subtree stats flushing Stats flushing for memcg currently follows the following rules: - Always flush the entire memcg hierarchy (i.e. flush the root). - Only one flusher is allowed at a time. If someone else tries to flush concurrently, they skip and return immediately. - A periodic flusher flushes all the stats every 2 seconds. The reason this approach is followed is because all flushes are serialized by a global rstat spinlock. On the memcg side, flushing is invoked from userspace reads as well as in-kernel flushers (e.g. reclaim, refault, etc). This approach aims to avoid serializing all flushers on the global lock, which can cause a significant performance hit under high concurrency. This approach has the following problems: - Occasionally a userspace read of the stats of a non-root cgroup will be too expensive as it has to flush the entire hierarchy [1]. 
- Sometimes the stats accuracy is compromised if there is an ongoing flush, and we skip and return before the subtree of interest is actually flushed, yielding stale stats (by up to 2s due to periodic flushing). This is more visible when reading stats from userspace, but can also affect in-kernel flushers. The latter problem is particularly a concern when userspace reads stats after an event occurs, but gets stats from before the event. Examples: - When memory usage / pressure spikes, a userspace OOM handler may look at the stats of different memcgs to select a victim based on various heuristics (e.g. how much private memory will be freed by killing this). Reading stale stats from before the usage spike in this case may cause a wrongful OOM kill. - A proactive reclaimer may read the stats after writing to memory.reclaim to measure the success of the reclaim operation. Stale stats from before reclaim may give a false negative. - Reading the stats of a parent and a child memcg may be inconsistent (child larger than parent), if the flush doesn't happen when the parent is read, but happens when the child is read. As for in-kernel flushers, they will occasionally get stale stats. No regressions are currently known from this, but if there are regressions, they would be very difficult to debug and link to the source of the problem. This patch aims to fix these problems by restoring subtree flushing, and removing the unified/coalesced flushing logic that skips flushing if there is an ongoing flush. This change would introduce a significant regression with global stats flushing thresholds. With per-memcg stats flushing thresholds, this seems to perform really well. The thresholds protect the underlying lock from unnecessary contention. 
This patch was tested in two ways to ensure the latency of flushing is up to par, on a machine with 384 cpus: - A synthetic test with 5000 concurrent workers in 500 cgroups doing allocations and reclaim, as well as 1000 readers for memory.stat (variation of [2]). No regressions were noticed in the total runtime. Note that significant regressions in this test are observed with global stats thresholds, but not with per-memcg thresholds. - A synthetic stress test for concurrently reading memcg stats while memory allocation/freeing workers are running in the background, provided by Wei Xu [3]. With 250k threads reading the stats every 100ms in 50k cgroups, 99.9% of reads take <= 50us. Less than 0.01% of reads take more than 1ms, and no reads take more than 100ms. [1] https://lore.kernel.org/lkml/CABWYdi0c6__rh-K7dcM_pkf9BJdTRtAU08M43KO9ME4-dsgfoQ@mail.gmail.com/ [2] https://lore.kernel.org/lkml/CAJD7tka13M-zVZTyQJYL1iUAYvuQ1fcHbCjcOBZcz6POYTV-4g@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CAAPL-u9D2b=iF5Lf_cRnKxUfkiEe0AMDTu6yhrUAzX0b6a6rDg@mail.gmail.com/ [akpm@linux-foundation.org: fix mm/zswap.c] [yosryahmed@google.com: remove stats flushing mutex] Link: https://lkml.kernel.org/r/CAJD7tkZgP3m-VVPn+fF_YuvXeQYK=tZZjJHj=dzD=CcSSpp2qg@mail.gmail.com Link: https://lkml.kernel.org/r/20231129032154.3710765-6-yosryahmed@google.com Signed-off-by: Yosry Ahmed Tested-by: Domenico Cerasuolo Acked-by: Shakeel Butt Cc: Chris Li Cc: Greg Thelen Cc: Ivan Babrou Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutny Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun Heo Cc: Waiman Long Cc: Wei Xu Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a308c8eacf20d..43b77363ab8e7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1051,8 +1051,8 @@ static inline unsigned long 
lruvec_page_state_local(struct lruvec *lruvec, return x; } -void mem_cgroup_flush_stats(void); -void mem_cgroup_flush_stats_ratelimited(void); +void mem_cgroup_flush_stats(struct mem_cgroup *memcg); +void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1563,11 +1563,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void mem_cgroup_flush_stats(void) +static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg) { } -static inline void mem_cgroup_flush_stats_ratelimited(void) +static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } -- cgit v1.2.3 From 3485b88390b0af9e05dc2c3f57e9936f41e159a0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:04 +0000 Subject: mm: thp: introduce multi-size THP sysfs interface In preparation for adding support for anonymous multi-size THP, introduce new sysfs structure that will be used to control the new behaviours. A new directory is added under transparent_hugepage for each supported THP size, and contains an `enabled` file, which can be set to "inherit" (to inherit the global setting), "always", "madvise" or "never". For now, the kernel still only supports PMD-sized anonymous THP, so only 1 directory is populated. The first half of the change converts transhuge_vma_suitable() and hugepage_vma_check() so that they take a bitfield of orders for which the user wants to determine support, and the functions filter out all the orders that can't be supported, given the current sysfs configuration and the VMA dimensions. The resulting functions are renamed to thp_vma_suitable_orders() and thp_vma_allowable_orders() respectively. Convenience functions that take a single, unencoded order and return a boolean are also defined as thp_vma_suitable_order() and thp_vma_allowable_order(). 
The second half of the change implements the new sysfs interface. It has been done so that each supported THP size has a `struct thpsize`, which describes the relevant metadata and is itself a kobject. This is pretty minimal for now, but should make it easy to add new per-thpsize files to the interface if needed in future (e.g. per-size defrag). Rather than keep the `enabled` state directly in the struct thpsize, I've elected to directly encode it into huge_anon_orders_[always|madvise|inherit] bitfields since this reduces the amount of work required in thp_vma_allowable_orders() which is called for every page fault. See Documentation/admin-guide/mm/transhuge.rst, as modified by this commit, for details of how the new sysfs interface works. [ryan.roberts@arm.com: fix build warning when CONFIG_SYSFS is disabled] Link: https://lkml.kernel.org/r/20231211125320.3997543-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Barry Song Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A. 
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 181 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 155 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa0350b0812ab..609c153bae57d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -67,6 +67,24 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR)) + hpage_size >> PAGE_SHIFT)) return false; } - haddr = addr & HPAGE_PMD_MASK; + haddr = ALIGN_DOWN(addr, hpage_size); - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (haddr < vma->vm_start || haddr + hpage_size > vma->vm_end) return false; return true; } +/* + * Filter the bitfield of input orders to the ones suitable for use in the vma. + * See thp_vma_suitable_order(). + * All orders that pass the checks are returned as a bitfield. + */ +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) +{ + int order; + + /* + * Iterate over orders, highest to lowest, removing orders that don't + * meet alignment requirements from the set. Exit loop at first order + * that meets requirements, since all lower orders must also meet + * requirements. 
+ */ + + order = highest_order(orders); + + while (orders) { + if (thp_vma_suitable_order(vma, addr, order)) + break; + order = next_order(&orders, order); + } + + return orders; +} + static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -130,8 +208,52 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps, bool in_pf, bool enforce_sysfs); +unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders); + +/** + * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma + * @vma: the vm area to check + * @vm_flags: use these vm_flags instead of vma->vm_flags + * @smaps: whether answer will be used for smaps file + * @in_pf: whether answer will be used by page fault handler + * @enforce_sysfs: whether sysfs config should be taken into account + * @orders: bitfield of all orders to consider + * + * Calculates the intersection of the requested hugepage orders and the allowed + * hugepage orders for the provided vma. Permitted orders are encoded as a set + * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3 + * corresponds to order-3, etc). Order-0 is never considered a hugepage order. + * + * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage + * orders are allowed. + */ +static inline +unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + /* Optimization to check if required orders are enabled early. 
*/ + if (enforce_sysfs && vma_is_anonymous(vma)) { + unsigned long mask = READ_ONCE(huge_anon_orders_always); + + if (vm_flags & VM_HUGEPAGE) + mask |= READ_ONCE(huge_anon_orders_madvise); + if (hugepage_global_always() || + ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled())) + mask |= READ_ONCE(huge_anon_orders_inherit); + + orders &= mask; + if (!orders) + return 0; + } + + return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, + enforce_sysfs, orders); +} #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -267,17 +389,24 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long addr) +static inline bool thp_vma_suitable_order(struct vm_area_struct *vma, + unsigned long addr, int order) { return false; } -static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs) +static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, + unsigned long addr, unsigned long orders) { - return false; + return 0; +} + +static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs, + unsigned long orders) +{ + return 0; } static inline void folio_prep_large_rmappable(struct folio *folio) {} -- cgit v1.2.3 From 19eaf44954df64f9bc8dec398219e15ad0811497 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 7 Dec 2023 16:12:05 +0000 Subject: mm: thp: support allocation of anonymous multi-size THP Introduce the logic to allow THP to be configured (through the new sysfs interface we just added) to allocate large folios to back anonymous memory, which are larger than the base page size but smaller than PMD-size. We call this new THP extension "multi-size THP" (mTHP). 
mTHP continues to be PTE-mapped, but in many cases can still provide similar benefits to traditional PMD-sized THP: Page faults are significantly reduced (by a factor of e.g. 4, 8, 16, etc. depending on the configured order), but latency spikes are much less prominent because the size of each page isn't as huge as the PMD-sized variant and there is less memory to clear in each page fault. The number of per-page operations (e.g. ref counting, rmap management, lru list management) is also significantly reduced since those ops now become per-folio. Some architectures also employ TLB compression mechanisms to squeeze more entries in when a set of PTEs are virtually and physically contiguous and appropriately aligned. In this case, TLB misses will occur less often. The new behaviour is disabled by default, but can be enabled at runtime by writing to /sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled (see documentation in previous commit). The long term aim is to change the default to include suitable lower orders, but there are some risks around internal fragmentation that need to be better understood first. [ryan.roberts@arm.com: resolve some multi-size THP review nits] Link: https://lkml.kernel.org/r/20231214160251.3574571-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20231207161211.2374093-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: Kefeng Wang Tested-by: John Hubbard Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: David Rientjes Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Itaru Kitayama Cc: Kirill A.
Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: Yang Shi Cc: Yin Fengwei Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 609c153bae57d..fa7a38a30fc68 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -68,9 +68,11 @@ extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_NR (1< Date: Wed, 13 Dec 2023 19:03:33 +0000 Subject: mm/damon: update email of SeongJae Patch series "mm/damon: misc updates for 6.8". Update comments, tests, and documents for DAMON. This patch (of 6): SeongJae is using his kernel.org account for DAMON development. Update the old email addresses on the comments of DAMON source files. Link: https://lkml.kernel.org/r/20231213190338.54146-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231213190338.54146-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 12510d8c51c62..5881e4ac30be6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -2,7 +2,7 @@ /* * DAMON api * - * Author: SeongJae Park + * Author: SeongJae Park */ #ifndef _DAMON_H_ -- cgit v1.2.3 From 0abfa8efad8dccc3899f64dafa985a251714a709 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:33:16 -0800 Subject: gfp: gfp_types.h: fix typos & punctuation Correct typos/spellos and punctuation.
Link: https://lkml.kernel.org/r/20231213043316.10128-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index ae994534a12aa..1b6053da8754e 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -162,25 +162,25 @@ typedef unsigned int __bitwise gfp_t; * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * * The default allocator behavior depends on the request size. We have a concept - * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). + * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly * non-failing by default (with some exceptions like OOM victims might fail so * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these - * implicit rules + * implicit rules. * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus * it can sleep). It will avoid disruptive actions like OOM killer. The * caller must handle the failure which is quite likely to happen under * heavy memory pressure. The flag is suitable when failure can easily be - * handled at small cost, such as reduced throughput + * handled at small cost, such as reduced throughput. * * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim * procedures that have previously failed if there is some indication - * that progress has been made else where. 
It can wait for other - * tasks to attempt high level approaches to freeing memory such as + * that progress has been made elsewhere. It can wait for other + * tasks to attempt high-level approaches to freeing memory such as * compaction (which removes fragmentation) and page-out. * There is still a definite limit to the number of retries, but it is * a larger limit than with %__GFP_NORETRY. @@ -230,7 +230,7 @@ typedef unsigned int __bitwise gfp_t; * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting * memory tags at the same time as zeroing memory has minimal additional - * performace impact. + * performance impact. * * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. * Used for userspace and vmalloc pages; the latter are unpoisoned by -- cgit v1.2.3 From cbc2fe9d9cb226347365753f50d81bc48cc3c52e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 13 Dec 2023 13:57:41 +0800 Subject: kexec_file: add kexec_file flag to control debug printing Patch series "kexec_file: print out debugging message if required", v4. Currently, specifying '-d' on kexec command will print a lot of debugging information about kexec/kdump loading with kexec_load interface. However, kexec_file_load prints nothing even though '-d' is specified. It's very inconvenient to debug or analyze the kexec/kdump loading when something wrong happened with kexec/kdump itself or a developer wants to check the kexec/kdump loading. In this patchset, a kexec_file flag, KEXEC_FILE_DEBUG, is added and checked in code. If it's passed in, debugging message of kexec_file code will be printed out and can be seen from console and dmesg. Otherwise, the debugging message is printed like before when pr_debug() is taken. Note: **** ===== 1) The code in kexec-tools utility also needs to be changed to support passing KEXEC_FILE_DEBUG to kernel when 'kexec -s -d' is specified.
The patch link is here: ========= [PATCH] kexec_file: add kexec_file flag to support debug printing http://lists.infradead.org/pipermail/kexec/2023-November/028505.html 2) s390 also has kexec_file code, while I am not sure what debugging information is necessary. So leave it to s390 developer. Test: **** ==== Testing was done in v1 on x86_64 and arm64. For v4, tested on x86_64 again. And on x86_64, the printed messages look like below: -------------------------------------------------------------- kexec measurement buffer for the loaded kernel at 0x207fffe000. Loaded purgatory at 0x207fff9000 Loaded boot_param, command line and misc at 0x207fff3000 bufsz=0x1180 memsz=0x1180 Loaded 64bit kernel at 0x207c000000 bufsz=0xc88200 memsz=0x3c4a000 Loaded initrd at 0x2079e79000 bufsz=0x2186280 memsz=0x2186280 Final command line is: root=/dev/mapper/fedora_intel--knightslanding--lb--02-root ro rd.lvm.lv=fedora_intel-knightslanding-lb-02/root console=ttyS0,115200N81 crashkernel=256M E820 memmap: 0000000000000000-000000000009a3ff (1) 000000000009a400-000000000009ffff (2) 00000000000e0000-00000000000fffff (2) 0000000000100000-000000006ff83fff (1) 000000006ff84000-000000007ac50fff (2) ...... 
000000207fff6150-000000207fff615f (128) 000000207fff6160-000000207fff714f (1) 000000207fff7150-000000207fff715f (128) 000000207fff7160-000000207fff814f (1) 000000207fff8150-000000207fff815f (128) 000000207fff8160-000000207fffffff (1) nr_segments = 5 segment[0]: buf=0x000000004e5ece74 bufsz=0x211 mem=0x207fffe000 memsz=0x1000 segment[1]: buf=0x000000009e871498 bufsz=0x4000 mem=0x207fff9000 memsz=0x5000 segment[2]: buf=0x00000000d879f1fe bufsz=0x1180 mem=0x207fff3000 memsz=0x2000 segment[3]: buf=0x000000001101cd86 bufsz=0xc88200 mem=0x207c000000 memsz=0x3c4a000 segment[4]: buf=0x00000000c6e38ac7 bufsz=0x2186280 mem=0x2079e79000 memsz=0x2187000 kexec_file_load: type:0, start:0x207fff91a0 head:0x109e004002 flags:0x8 --------------------------------------------------------------------------- This patch (of 7): When specifying 'kexec -c -d', kexec_load interface will print loading information, e.g the regions where kernel/initrd/purgatory/cmdline are put, the memmap passed to 2nd kernel taken as system RAM ranges, and printing all contents of struct kexec_segment, etc. These are very helpful for analyzing or positioning what's happening when kexec/kdump itself failed. The debugging printing for kexec_load interface is made in user space utility kexec-tools. Whereas, with kexec_file_load interface, 'kexec -s -d' print nothing. Because kexec_file code is mostly implemented in kernel space, and the debugging printing functionality is missed. It's not convenient when debugging kexec/kdump loading and jumping with kexec_file_load interface. Now add KEXEC_FILE_DEBUG to kexec_file flag to control the debugging message printing. And add global variable kexec_file_dbg_print and macro kexec_dprintk() to facilitate the printing. This is a preparation, later kexec_dprintk() will be used to replace the existing pr_debug(). Once 'kexec -s -d' is specified, it will print out kexec/kdump loading information. If '-d' is not specified, it regresses to pr_debug(). 
Link: https://lkml.kernel.org/r/20231213055747.61826-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20231213055747.61826-2-bhe@redhat.com Signed-off-by: Baoquan He Cc: Conor Dooley Cc: Joe Perches Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/kexec.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 8227455192b73..400cb6c02176e 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -403,7 +403,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ - KEXEC_FILE_NO_INITRAMFS) + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; @@ -500,6 +500,13 @@ static inline int crash_hotplug_memory_support(void) { return 0; } static inline unsigned int crash_get_elfcorehdr_size(void) { return 0; } #endif +extern bool kexec_file_dbg_print; + +#define kexec_dprintk(fmt, ...) \ + printk("%s" fmt, \ + kexec_file_dbg_print ? 
KERN_INFO : KERN_DEBUG, \ + ##__VA_ARGS__) + #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; -- cgit v1.2.3 From f6120d527b8611aeaa1a34a33337f530d78a789c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 19:56:59 -0500 Subject: task_stack.h: add missing include Signed-off-by: Kent Overstreet --- include/linux/sched/task_stack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index f158b025c1750..ccd72b978e1fc 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -8,6 +8,7 @@ #include #include +#include #ifdef CONFIG_THREAD_INFO_IN_TASK -- cgit v1.2.3 From 6a2623b17634688cfec58dd44041c5db2143719d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 20:00:15 -0500 Subject: nsproxy.h: add missing include Signed-off-by: Kent Overstreet --- include/linux/nsproxy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 771cb02858724..5601d14e28869 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -2,6 +2,7 @@ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H +#include #include #include -- cgit v1.2.3 From 316aa04d1ffa61f73ce2679d1ae1dca8747aeb1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:00:59 -0500 Subject: kmsan: add missing types.h dependency more header dependency pruning/fixing Signed-off-by: Kent Overstreet --- include/linux/kmsan_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h index 8bfa6c98176d4..929287981afe4 100644 --- a/include/linux/kmsan_types.h +++ b/include/linux/kmsan_types.h @@ -9,6 +9,8 @@ #ifndef _LINUX_KMSAN_TYPES_H #define _LINUX_KMSAN_TYPES_H +#include + /* These constants are defined in the MSan LLVM instrumentation pass. 
*/ #define KMSAN_RETVAL_SIZE 800 #define KMSAN_PARAM_SIZE 800 -- cgit v1.2.3 From 058e0529d12ae9fed34c6eeef700cd294f9622a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 20:22:22 -0500 Subject: time_namespace.h: fix missing include Signed-off-by: Kent Overstreet --- include/linux/time_namespace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 03d9c5ac01d14..5258d81cef179 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -7,6 +7,7 @@ #include #include #include +#include struct user_namespace; extern struct user_namespace init_user_ns; -- cgit v1.2.3 From ea115c248a478ce1acbf4776e4666fb663285b2f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Dec 2023 20:01:01 -0500 Subject: torture: add missing dependency on hrtimer.h Signed-off-by: Kent Overstreet --- include/linux/torture.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/torture.h b/include/linux/torture.h index c98d0c83d117c..1541454da03e8 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -21,6 +21,7 @@ #include #include #include +#include /* Definitions for a non-string torture-test module parameter. */ #define torture_param(type, name, init, msg) \ -- cgit v1.2.3 From bea32141764bc76db2d75c9484b71ded56119ab4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Dec 2022 14:32:23 -0500 Subject: nodemask: Split out include/linux/nodemask_types.h sched.h, which defines task_struct, needs nodemask_t - but sched.h is a frequently used header and ideally shouldn't be pulling in any more code that it needs to. This splits out nodemask_types.h which has the definition sched.h needs, which will avoid a circular header dependency in the alloc tagging patch series, and as a bonus should speed up kernel build times. 
Signed-off-by: Kent Overstreet Signed-off-by: Suren Baghdasaryan Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/nodemask.h | 2 +- include/linux/nodemask_types.h | 10 ++++++++++ include/linux/sched.h | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 include/linux/nodemask_types.h (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 8d07116caaf1b..b61438313a731 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -93,10 +93,10 @@ #include #include #include +#include #include #include -typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; /** diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h new file mode 100644 index 0000000000000..6b28d97ea6ed0 --- /dev/null +++ b/include/linux/nodemask_types.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_NODEMASK_TYPES_H +#define __LINUX_NODEMASK_TYPES_H + +#include +#include + +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; + +#endif /* __LINUX_NODEMASK_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c316972485..5a5b7b1226826 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From d9f29deb7fe8137fd1954871443cbbc1b6125832 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Jan 2023 21:32:18 -0500 Subject: prandom: Remove unused include prandom.h doesn't use percpu.h - this fixes some circular header issues. 
Signed-off-by: Kent Overstreet Signed-off-by: Suren Baghdasaryan --- include/linux/prandom.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b3d6f..f7f1e5251c679 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -10,7 +10,6 @@ #include #include -#include #include struct rnd_state { -- cgit v1.2.3 From 6060ef31f1162fb91a1688fa5098b38c4b9c680c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 2 Dec 2022 14:39:55 -0500 Subject: timekeeping: Kill percpu.h dependency Slimming down recursive header includes. Signed-off-by: Kent Overstreet Cc: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- include/linux/time_namespace.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index f2044d5a652b5..02d264ca9dce4 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 5258d81cef179..876e31b4461d0 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -12,6 +12,8 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct vm_area_struct; + struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; -- cgit v1.2.3 From d7a73e3f089204aee3393687e23fd45a22657b08 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:27:00 -0500 Subject: kernel/numa.c: Move logging out of numa.h Moving these stub functions to a .c file means we can kill a sched.h dependency on printk.h. 
Signed-off-by: Kent Overstreet --- include/linux/numa.h | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index a904861de8000..915033a757315 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H +#include #include #ifdef CONFIG_NODES_SHIFT @@ -22,34 +23,26 @@ #endif #ifdef CONFIG_NUMA -#include #include /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); #ifndef memory_add_physaddr_to_nid -static inline int memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} +int memory_add_physaddr_to_nid(u64 start); #endif + #ifndef phys_to_target_node -static inline int phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} +int phys_to_target_node(u64 start); #endif + #ifndef numa_fill_memblks static inline int __init numa_fill_memblks(u64 start, u64 end) { return NUMA_NO_MEMBLK; } #endif + #else /* !CONFIG_NUMA */ static inline int numa_nearest_node(int node, unsigned int state) { -- cgit v1.2.3 From d1d71b30e1f85e8b5d7c0d8edc16869bdc4d535f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:05:04 -0500 Subject: sched.h: Move (spin|rwlock)_needbreak() to spinlock.h This lets us kill the dependency on spinlock.h. 
Signed-off-by: Kent Overstreet --- include/linux/sched.h | 31 ------------------------------- include/linux/spinlock.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a5b7b1226826..7501a3451a201 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2227,37 +2227,6 @@ static inline bool preempt_model_preemptible(void) return preempt_model_full() || preempt_model_rt(); } -/* - * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPTION, - * but a general need for low latency) - */ -static inline int spin_needbreak(spinlock_t *lock) -{ -#ifdef CONFIG_PREEMPTION - return spin_is_contended(lock); -#else - return 0; -#endif -} - -/* - * Check if a rwlock is contended. - * Returns non-zero if there is another task waiting on the rwlock. - * Returns zero if the lock is not contended or the system / underlying - * rwlock implementation does not support contention detection. - * Technically does not depend on CONFIG_PREEMPTION, but a general need - * for low latency. 
- */ -static inline int rwlock_needbreak(rwlock_t *lock) -{ -#ifdef CONFIG_PREEMPTION - return rwlock_is_contended(lock); -#else - return 0; -#endif -} - static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 31d3d747a9db7..0c71f06454d9e 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -449,6 +449,37 @@ static __always_inline int spin_is_contended(spinlock_t *lock) return raw_spin_is_contended(&lock->rlock); } +/* + * Does a critical section need to be broken due to another + * task waiting?: (technically does not depend on CONFIG_PREEMPTION, + * but a general need for low latency) + */ +static inline int spin_needbreak(spinlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return spin_is_contended(lock); +#else + return 0; +#endif +} + +/* + * Check if a rwlock is contended. + * Returns non-zero if there is another task waiting on the rwlock. + * Returns zero if the lock is not contended or the system / underlying + * rwlock implementation does not support contention detection. + * Technically does not depend on CONFIG_PREEMPTION, but a general need + * for low latency. + */ +static inline int rwlock_needbreak(rwlock_t *lock) +{ +#ifdef CONFIG_PREEMPTION + return rwlock_is_contended(lock); +#else + return 0; +#endif +} + #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) #else /* !CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From 2e346b19aab9ee40e5e429667a0a515f1d68b714 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:11:05 -0500 Subject: ktime.h: move ktime_t to types.h ktime.h pulls in quite a few headers recursively (including printk.h) - this is going to help with trimming sched.h dependencies. 
Signed-off-by: Kent Overstreet --- include/linux/ktime.h | 8 +++----- include/linux/types.h | 3 +++ 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 73f20deb497d5..3a4e723eae0f1 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -21,12 +21,10 @@ #ifndef _LINUX_KTIME_H #define _LINUX_KTIME_H -#include -#include #include - -/* Nanosecond scalar representation for kernel time values */ -typedef s64 ktime_t; +#include +#include +#include /** * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value diff --git a/include/linux/types.h b/include/linux/types.h index 253168bb3fe15..2bc8766ba20ca 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -120,6 +120,9 @@ typedef s64 int64_t; #define aligned_be64 __aligned_be64 #define aligned_le64 __aligned_le64 +/* Nanosecond scalar representation for kernel time values */ +typedef s64 ktime_t; + /** * The type used for indexing onto a disc or disc partition. * -- cgit v1.2.3 From 50d91c76582513852e38eb80491f54d44cfb51fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 11:52:02 -0500 Subject: hrtimers: Split out hrtimer_types.h We need to reduce the scope of what's included in sched.h: task_struct includes a hrtimer, so split out the core types into their own header. 
Signed-off-by: Kent Overstreet Cc: Thomas Gleixner --- include/linux/hrtimer.h | 44 ++----------------------------------- include/linux/hrtimer_types.h | 50 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 2 +- 3 files changed, 53 insertions(+), 43 deletions(-) create mode 100644 include/linux/hrtimer_types.h (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 02d264ca9dce4..87e3bedf8eb00 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,13 +13,13 @@ #define _LINUX_HRTIMER_H #include -#include +#include #include #include #include +#include #include #include -#include struct hrtimer_clock_base; struct hrtimer_cpu_base; @@ -59,14 +59,6 @@ enum hrtimer_mode { HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; -/* - * Return values for the callback function - */ -enum hrtimer_restart { - HRTIMER_NORESTART, /* Timer is not restarted */ - HRTIMER_RESTART, /* Timer must be restarted */ -}; - /* * Values to track state of the timer * @@ -94,38 +86,6 @@ enum hrtimer_restart { #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 -/** - * struct hrtimer - the basic hrtimer structure - * @node: timerqueue node, which also manages node.expires, - * the absolute expiry time in the hrtimers internal - * representation. The time is related to the clock on - * which the timer is based. Is setup by adding - * slack to the _softexpires value. For non range timers - * identical to _softexpires. - * @_softexpires: the absolute earliest expiry time of the hrtimer. - * The time which was given as expiry time when the timer - * was armed. - * @function: timer expiry callback function - * @base: pointer to the timer base (per cpu and per clock) - * @state: state information (See bit values above) - * @is_rel: Set if the timer was armed relative - * @is_soft: Set if hrtimer will be expired in soft interrupt context. 
- * @is_hard: Set if hrtimer will be expired in hard interrupt context - * even on RT. - * - * The hrtimer structure must be initialized by hrtimer_init() - */ -struct hrtimer { - struct timerqueue_node node; - ktime_t _softexpires; - enum hrtimer_restart (*function)(struct hrtimer *); - struct hrtimer_clock_base *base; - u8 state; - u8 is_rel; - u8 is_soft; - u8 is_hard; -}; - /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h new file mode 100644 index 0000000000000..f4ef391b96a7a --- /dev/null +++ b/include/linux/hrtimer_types.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HRTIMER_TYPES_H +#define _LINUX_HRTIMER_TYPES_H + +#include +#include + +struct hrtimer_clock_base; + +/* + * Return values for the callback function + */ +enum hrtimer_restart { + HRTIMER_NORESTART, /* Timer is not restarted */ + HRTIMER_RESTART, /* Timer must be restarted */ +}; + +/** + * struct hrtimer - the basic hrtimer structure + * @node: timerqueue node, which also manages node.expires, + * the absolute expiry time in the hrtimers internal + * representation. The time is related to the clock on + * which the timer is based. Is setup by adding + * slack to the _softexpires value. For non range timers + * identical to _softexpires. + * @_softexpires: the absolute earliest expiry time of the hrtimer. + * The time which was given as expiry time when the timer + * was armed. + * @function: timer expiry callback function + * @base: pointer to the timer base (per cpu and per clock) + * @state: state information (See bit values above) + * @is_rel: Set if the timer was armed relative + * @is_soft: Set if hrtimer will be expired in soft interrupt context. + * @is_hard: Set if hrtimer will be expired in hard interrupt context + * even on RT. 
+ * + * The hrtimer structure must be initialized by hrtimer_init() + */ +struct hrtimer { + struct timerqueue_node node; + ktime_t _softexpires; + enum hrtimer_restart (*function)(struct hrtimer *); + struct hrtimer_clock_base *base; + u8 state; + u8 is_rel; + u8 is_soft; + u8 is_hard; +}; + +#endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 7501a3451a201..3762809652dac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From d84f317915172c6511fd6c14ea3f70c9d67fdf67 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 12:35:44 -0500 Subject: locking/mutex: split out mutex_types.h Trimming down sched.h dependencies: we don't want to include more than the base types. Signed-off-by: Kent Overstreet Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Signed-off-by: Kent Overstreet --- include/linux/mutex.h | 52 +-------------------------------- include/linux/mutex_types.h | 71 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 2 +- 3 files changed, 73 insertions(+), 52 deletions(-) create mode 100644 include/linux/mutex_types.h (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index a33aa9eb9fc3b..0dfba5df65243 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ @@ -33,49 +34,6 @@ #ifndef CONFIG_PREEMPT_RT -/* - * Simple, straightforward mutexes with strict semantics: - * - * - only one task can hold the mutex at a time - * - only the owner can unlock the mutex - * - multiple unlocks are not permitted - * - recursive locking is not permitted - * - a mutex object must be initialized via the API - * - a mutex object must not be initialized via memset or 
copying - * - task may not exit with mutex held - * - memory areas where held locks reside must not be freed - * - held mutexes must not be reinitialized - * - mutexes may not be used in hardware or software interrupt - * contexts such as tasklets and timers - * - * These semantics are fully enforced when DEBUG_MUTEXES is - * enabled. Furthermore, besides enforcing the above rules, the mutex - * debugging code also implements a number of additional features - * that make lock debugging easier and faster: - * - * - uses symbolic names of mutexes, whenever they are printed in debug output - * - point-of-acquire tracking, symbolic lookup of function names - * - list of all locks held in the system, printout of them - * - owner tracking - * - detects self-recursing locks and prints out all relevant info - * - detects multi-task circular deadlocks and prints out all affected - * locks and tasks (and only those tasks) - */ -struct mutex { - atomic_long_t owner; - raw_spinlock_t wait_lock; -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct optimistic_spin_queue osq; /* Spinner MCS lock */ -#endif - struct list_head wait_list; -#ifdef CONFIG_DEBUG_MUTEXES - void *magic; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; - #ifdef CONFIG_DEBUG_MUTEXES #define __DEBUG_MUTEX_INITIALIZER(lockname) \ @@ -131,14 +89,6 @@ extern bool mutex_is_locked(struct mutex *lock); /* * Preempt-RT variant based on rtmutexes. 
*/ -#include - -struct mutex { - struct rt_mutex_base rtmutex; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -}; #define __MUTEX_INITIALIZER(mutexname) \ { \ diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h new file mode 100644 index 0000000000000..fdf7f515fde8e --- /dev/null +++ b/include/linux/mutex_types.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MUTEX_TYPES_H +#define __LINUX_MUTEX_TYPES_H + +#include +#include +#include +#include +#include + +#ifndef CONFIG_PREEMPT_RT + +/* + * Simple, straightforward mutexes with strict semantics: + * + * - only one task can hold the mutex at a time + * - only the owner can unlock the mutex + * - multiple unlocks are not permitted + * - recursive locking is not permitted + * - a mutex object must be initialized via the API + * - a mutex object must not be initialized via memset or copying + * - task may not exit with mutex held + * - memory areas where held locks reside must not be freed + * - held mutexes must not be reinitialized + * - mutexes may not be used in hardware or software interrupt + * contexts such as tasklets and timers + * + * These semantics are fully enforced when DEBUG_MUTEXES is + * enabled. 
Furthermore, besides enforcing the above rules, the mutex + * debugging code also implements a number of additional features + * that make lock debugging easier and faster: + * + * - uses symbolic names of mutexes, whenever they are printed in debug output + * - point-of-acquire tracking, symbolic lookup of function names + * - list of all locks held in the system, printout of them + * - owner tracking + * - detects self-recursing locks and prints out all relevant info + * - detects multi-task circular deadlocks and prints out all affected + * locks and tasks (and only those tasks) + */ +struct mutex { + atomic_long_t owner; + raw_spinlock_t wait_lock; +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + struct optimistic_spin_queue osq; /* Spinner MCS lock */ +#endif + struct list_head wait_list; +#ifdef CONFIG_DEBUG_MUTEXES + void *magic; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#else /* !CONFIG_PREEMPT_RT */ +/* + * Preempt-RT variant based on rtmutexes. + */ +#include + +struct mutex { + struct rt_mutex_base rtmutex; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#endif /* CONFIG_PREEMPT_RT */ + +#endif /* __LINUX_MUTEX_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3762809652dac..e8892789969b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 53d31ba842d9cc391032d051a210c3c9941f1529 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 12:43:30 -0500 Subject: posix-cpu-timers: Split out posix-timers_types.h Trimming down sched.h dependencies: we don't want to include more than the base types. 
Cc: Thomas Gleixner Signed-off-by: Kent Overstreet --- include/linux/posix-timers.h | 69 ++------------------------------ include/linux/posix-timers_types.h | 80 ++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 2 +- 3 files changed, 84 insertions(+), 67 deletions(-) create mode 100644 include/linux/posix-timers_types.h (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index d607f51404fca..dc7b738de2998 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -2,40 +2,16 @@ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H -#include +#include #include #include -#include +#include +#include #include struct kernel_siginfo; struct task_struct; -/* - * Bit fields within a clockid: - * - * The most significant 29 bits hold either a pid or a file descriptor. - * - * Bit 2 indicates whether a cpu clock refers to a thread or a process. - * - * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. - * - * A clockid is invalid if bits 2, 1, and 0 are all set. 
- */ -#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) -#define CPUCLOCK_PERTHREAD(clock) \ - (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) - -#define CPUCLOCK_PERTHREAD_MASK 4 -#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) -#define CPUCLOCK_CLOCK_MASK 3 -#define CPUCLOCK_PROF 0 -#define CPUCLOCK_VIRT 1 -#define CPUCLOCK_SCHED 2 -#define CPUCLOCK_MAX 3 -#define CLOCKFD CPUCLOCK_MAX -#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) - static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { @@ -109,44 +85,6 @@ static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) ctmr->node.expires = exp; } -/** - * posix_cputimer_base - Container per posix CPU clock - * @nextevt: Earliest-expiration cache - * @tqhead: timerqueue head for cpu_timers - */ -struct posix_cputimer_base { - u64 nextevt; - struct timerqueue_head tqhead; -}; - -/** - * posix_cputimers - Container for posix CPU timer related data - * @bases: Base container for posix CPU clocks - * @timers_active: Timers are queued. - * @expiry_active: Timer expiry is active. 
Used for - * process wide timers to avoid multiple - * task trying to handle expiry concurrently - * - * Used in task_struct and signal_struct - */ -struct posix_cputimers { - struct posix_cputimer_base bases[CPUCLOCK_MAX]; - unsigned int timers_active; - unsigned int expiry_active; -}; - -/** - * posix_cputimers_work - Container for task work based posix CPU timer expiry - * @work: The task work to be scheduled - * @mutex: Mutex held around expiry in context of this task work - * @scheduled: @work has been scheduled already, no further processing - */ -struct posix_cputimers_work { - struct callback_head work; - struct mutex mutex; - unsigned int scheduled; -}; - static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); @@ -179,7 +117,6 @@ static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else -struct posix_cputimers { }; struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h new file mode 100644 index 0000000000000..4783fa17bfeb1 --- /dev/null +++ b/include/linux/posix-timers_types.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _linux_POSIX_TIMERS_TYPES_H +#define _linux_POSIX_TIMERS_TYPES_H + +#include +#include +#include + +/* + * Bit fields within a clockid: + * + * The most significant 29 bits hold either a pid or a file descriptor. + * + * Bit 2 indicates whether a cpu clock refers to a thread or a process. + * + * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. + * + * A clockid is invalid if bits 2, 1, and 0 are all set. 
+ */ +#define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) +#define CPUCLOCK_PERTHREAD(clock) \ + (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) + +#define CPUCLOCK_PERTHREAD_MASK 4 +#define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) +#define CPUCLOCK_CLOCK_MASK 3 +#define CPUCLOCK_PROF 0 +#define CPUCLOCK_VIRT 1 +#define CPUCLOCK_SCHED 2 +#define CPUCLOCK_MAX 3 +#define CLOCKFD CPUCLOCK_MAX +#define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) + +#ifdef CONFIG_POSIX_TIMERS + +/** + * posix_cputimer_base - Container per posix CPU clock + * @nextevt: Earliest-expiration cache + * @tqhead: timerqueue head for cpu_timers + */ +struct posix_cputimer_base { + u64 nextevt; + struct timerqueue_head tqhead; +}; + +/** + * posix_cputimers - Container for posix CPU timer related data + * @bases: Base container for posix CPU clocks + * @timers_active: Timers are queued. + * @expiry_active: Timer expiry is active. Used for + * process wide timers to avoid multiple + * task trying to handle expiry concurrently + * + * Used in task_struct and signal_struct + */ +struct posix_cputimers { + struct posix_cputimer_base bases[CPUCLOCK_MAX]; + unsigned int timers_active; + unsigned int expiry_active; +}; + +/** + * posix_cputimers_work - Container for task work based posix CPU timer expiry + * @work: The task work to be scheduled + * @mutex: Mutex held around expiry in context of this task work + * @scheduled: @work has been scheduled already, no further processing + */ +struct posix_cputimers_work { + struct callback_head work; + struct mutex mutex; + unsigned int scheduled; +}; + +#else /* CONFIG_POSIX_TIMERS */ + +struct posix_cputimers { }; + +#endif /* CONFIG_POSIX_TIMERS */ + +#endif /* _linux_POSIX_TIMERS_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e8892789969b7..6d803d0904d94 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -31,7 +31,7 @@ #include #include #include -#include 
+#include #include #include #include -- cgit v1.2.3 From f038cc1379c0ff462d83895cae8beb75a0f6bf02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:01:06 -0500 Subject: locking/seqlock: Split out seqlock_types.h Trimming down sched.h dependencies: we don't want to include more than the base types. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Waiman Long Cc: Boqun Feng Signed-off-by: Kent Overstreet --- include/linux/sched.h | 2 +- include/linux/seqlock.h | 79 +----------------------------------- include/linux/seqlock_types.h | 93 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 78 deletions(-) create mode 100644 include/linux/seqlock_types.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d803d0904d94..436f7ce1450af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index e92f9d5577bac..d90d8ee29d811 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -37,37 +38,6 @@ */ #define KCSAN_SEQLOCK_REGION_MAX 1000 -/* - * Sequence counters (seqcount_t) - * - * This is the raw counting mechanism, without any writer protection. - * - * Write side critical sections must be serialized and non-preemptible. - * - * If readers can be invoked from hardirq or softirq contexts, - * interrupts or bottom halves must also be respectively disabled before - * entering the write section. - * - * This mechanism can't be used if the protected data contains pointers, - * as the writer can invalidate a pointer that a reader is following. - * - * If the write serialization mechanism is one of the common kernel - * locking primitives, use a sequence counter with associated lock - * (seqcount_LOCKNAME_t) instead. 
- * - * If it's desired to automatically handle the sequence counter writer - * serialization and non-preemptibility requirements, use a sequential - * lock (seqlock_t) instead. - * - * See Documentation/locking/seqlock.rst - */ -typedef struct seqcount { - unsigned sequence; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} seqcount_t; - static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { @@ -131,28 +101,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * See Documentation/locking/seqlock.rst */ -/* - * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot - * disable preemption. It can lead to higher latencies, and the write side - * sections will not be able to acquire locks which become sleeping locks - * (e.g. spinlock_t). - * - * To remain preemptible while avoiding a possible livelock caused by the - * reader preempting the writer, use a different technique: let the reader - * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the - * case, acquire then release the associated LOCKNAME writer serialization - * lock. This will allow any possibly-preempted writer to make progress - * until the end of its writer serialization lock critical section. - * - * This lock-unlock technique must be implemented for all of PREEMPT_RT - * sleeping locks. 
See Documentation/locking/locktypes.rst - */ -#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) -#define __SEQ_LOCK(expr) expr -#else -#define __SEQ_LOCK(expr) -#endif - /* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter @@ -194,11 +142,6 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s) * @lockbase: prefix for associated lock/unlock */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ -typedef struct seqcount_##lockname { \ - seqcount_t seqcount; \ - __SEQ_LOCK(locktype *lock); \ -} seqcount_##lockname##_t; \ - \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ @@ -284,6 +227,7 @@ SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) +#undef SEQCOUNT_LOCKNAME /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t @@ -794,25 +738,6 @@ static inline void raw_write_seqcount_latch(seqcount_latch_t *s) smp_wmb(); /* increment "sequence" before following stores */ } -/* - * Sequential locks (seqlock_t) - * - * Sequence counters with an embedded spinlock for writer serialization - * and non-preemptibility. - * - * For more info, see: - * - Comments on top of seqcount_t - * - Documentation/locking/seqlock.rst - */ -typedef struct { - /* - * Make sure that readers don't starve writers on PREEMPT_RT: use - * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). 
- */ - seqcount_spinlock_t seqcount; - spinlock_t lock; -} seqlock_t; - #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ diff --git a/include/linux/seqlock_types.h b/include/linux/seqlock_types.h new file mode 100644 index 0000000000000..dfdf43e3fa3de --- /dev/null +++ b/include/linux/seqlock_types.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_SEQLOCK_TYPES_H +#define __LINUX_SEQLOCK_TYPES_H + +#include +#include +#include + +/* + * Sequence counters (seqcount_t) + * + * This is the raw counting mechanism, without any writer protection. + * + * Write side critical sections must be serialized and non-preemptible. + * + * If readers can be invoked from hardirq or softirq contexts, + * interrupts or bottom halves must also be respectively disabled before + * entering the write section. + * + * This mechanism can't be used if the protected data contains pointers, + * as the writer can invalidate a pointer that a reader is following. + * + * If the write serialization mechanism is one of the common kernel + * locking primitives, use a sequence counter with associated lock + * (seqcount_LOCKNAME_t) instead. + * + * If it's desired to automatically handle the sequence counter writer + * serialization and non-preemptibility requirements, use a sequential + * lock (seqlock_t) instead. + * + * See Documentation/locking/seqlock.rst + */ +typedef struct seqcount { + unsigned sequence; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} seqcount_t; + +/* + * For PREEMPT_RT, seqcount_LOCKNAME_t write side critical sections cannot + * disable preemption. It can lead to higher latencies, and the write side + * sections will not be able to acquire locks which become sleeping locks + * (e.g. spinlock_t). 
+ * + * To remain preemptible while avoiding a possible livelock caused by the + * reader preempting the writer, use a different technique: let the reader + * detect if a seqcount_LOCKNAME_t writer is in progress. If that is the + * case, acquire then release the associated LOCKNAME writer serialization + * lock. This will allow any possibly-preempted writer to make progress + * until the end of its writer serialization lock critical section. + * + * This lock-unlock technique must be implemented for all of PREEMPT_RT + * sleeping locks. See Documentation/locking/locktypes.rst + */ +#if defined(CONFIG_LOCKDEP) || defined(CONFIG_PREEMPT_RT) +#define __SEQ_LOCK(expr) expr +#else +#define __SEQ_LOCK(expr) +#endif + +#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ +typedef struct seqcount_##lockname { \ + seqcount_t seqcount; \ + __SEQ_LOCK(locktype *lock); \ +} seqcount_##lockname##_t; + +SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) +SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) +SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) +SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) +#undef SEQCOUNT_LOCKNAME + +/* + * Sequential locks (seqlock_t) + * + * Sequence counters with an embedded spinlock for writer serialization + * and non-preemptibility. + * + * For more info, see: + * - Comments on top of seqcount_t + * - Documentation/locking/seqlock.rst + */ +typedef struct { + /* + * Make sure that readers don't starve writers on PREEMPT_RT: use + * seqcount_spinlock_t instead of seqcount_t. Check __SEQ_LOCK(). + */ + seqcount_spinlock_t seqcount; + spinlock_t lock; +} seqlock_t; + +#endif /* __LINUX_SEQLOCK_TYPES_H */ -- cgit v1.2.3 From 6d5e9d63683042a8d344cd5d6f9cf23613864a29 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:03:22 -0500 Subject: pid: Split out pid_types.h Trimming down sched.h dependencies: we don't want to include more than the base types.
Cc: Kees Cook Cc: Andy Lutomirski Cc: Will Drewry Signed-off-by: Kent Overstreet --- include/linux/pid.h | 15 ++------------- include/linux/pid_types.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- include/linux/seccomp.h | 2 ++ 4 files changed, 21 insertions(+), 14 deletions(-) create mode 100644 include/linux/pid_types.h (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index 653a527574c4d..f254c3a45b9be 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -2,18 +2,10 @@ #ifndef _LINUX_PID_H #define _LINUX_PID_H +#include #include -#include #include - -enum pid_type -{ - PIDTYPE_PID, - PIDTYPE_TGID, - PIDTYPE_PGID, - PIDTYPE_SID, - PIDTYPE_MAX, -}; +#include /* * What is struct pid? @@ -110,9 +102,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); -struct pid_namespace; -extern struct pid_namespace init_pid_ns; - extern int pid_max; extern int pid_max_min, pid_max_max; diff --git a/include/linux/pid_types.h b/include/linux/pid_types.h new file mode 100644 index 0000000000000..c2aee1d91dcfd --- /dev/null +++ b/include/linux/pid_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_TYPES_H +#define _LINUX_PID_TYPES_H + +enum pid_type { + PIDTYPE_PID, + PIDTYPE_TGID, + PIDTYPE_PGID, + PIDTYPE_SID, + PIDTYPE_MAX, +}; + +struct pid_namespace; +extern struct pid_namespace init_pid_ns; + +#endif /* _LINUX_PID_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 436f7ce1450af..37cc9d2570737 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -11,7 +11,7 @@ #include -#include +#include #include #include #include diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 175079552f68d..1ec0d8dc4b69d 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -126,6 +126,8 @@ static inline long 
seccomp_get_metadata(struct task_struct *task, #ifdef CONFIG_SECCOMP_CACHE_DEBUG struct seq_file; +struct pid_namespace; +struct pid; int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); -- cgit v1.2.3 From f551103cb964e9e6f5c03b3b8723424723731e76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 17:49:24 -0500 Subject: sched.h: move pid helpers to pid.h This is needed for killing the sched.h dependency on rcupdate.h, and pid.h is a better place for this code anyways. Signed-off-by: Kent Overstreet --- include/linux/pid.h | 125 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 122 ----------------------------------------- include/linux/sched/signal.h | 1 + 3 files changed, 126 insertions(+), 122 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid.h b/include/linux/pid.h index f254c3a45b9be..395cacce1179c 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -4,7 +4,9 @@ #include #include +#include #include +#include #include /* @@ -204,4 +206,127 @@ pid_t pid_vnr(struct pid *pid); } \ task = tg___; \ } while_each_pid_task(pid, type, task) + +static inline struct pid *task_pid(struct task_struct *task) +{ + return task->thread_pid; +} + +/* + * the helpers to get the task's different pids as they are seen + * from various namespaces + * + * task_xid_nr() : global id, i.e. the id seen from the init namespace; + * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of + * current. 
+ * task_xid_nr_ns() : id seen from the ns specified; + * + * see also pid_nr() etc in include/linux/pid.h + */ +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); + +static inline pid_t task_pid_nr(struct task_struct *tsk) +{ + return tsk->pid; +} + +static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); +} + +static inline pid_t task_pid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); +} + + +static inline pid_t task_tgid_nr(struct task_struct *tsk) +{ + return tsk->tgid; +} + +/** + * pid_alive - check that a task structure is not stale + * @p: Task structure to be checked. + * + * Test if a process is not yet dead (at most zombie state) + * If pid_alive fails, then pointers within the task structure + * can be stale and must not be dereferenced. + * + * Return: 1 if the process is alive. 0 otherwise. + */ +static inline int pid_alive(const struct task_struct *p) +{ + return p->thread_pid != NULL; +} + +static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); +} + +static inline pid_t task_pgrp_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); +} + + +static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); +} + +static inline pid_t task_session_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); +} + +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; 
+ + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + +/* Obsolete, do not use: */ +static inline pid_t task_pgrp_nr(struct task_struct *tsk) +{ + return task_pgrp_nr_ns(tsk, &init_pid_ns); +} + +/** + * is_global_init - check if a task structure is init. Since init + * is free to have sub-threads we need to check tgid. + * @tsk: Task structure to be checked. + * + * Check if a task structure is the first user space task the kernel created. + * + * Return: 1 if the task structure is init. 0 otherwise. + */ +static inline int is_global_init(struct task_struct *tsk) +{ + return task_tgid_nr(tsk) == 1; +} + #endif /* _LINUX_PID_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 37cc9d2570737..9e2708c2cfa66 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1561,114 +1561,6 @@ struct task_struct { */ }; -static inline struct pid *task_pid(struct task_struct *task) -{ - return task->thread_pid; -} - -/* - * the helpers to get the task's different pids as they are seen - * from various namespaces - * - * task_xid_nr() : global id, i.e. the id seen from the init namespace; - * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of - * current. 
- * task_xid_nr_ns() : id seen from the ns specified; - * - * see also pid_nr() etc in include/linux/pid.h - */ -pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); - -static inline pid_t task_pid_nr(struct task_struct *tsk) -{ - return tsk->pid; -} - -static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); -} - -static inline pid_t task_pid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); -} - - -static inline pid_t task_tgid_nr(struct task_struct *tsk) -{ - return tsk->tgid; -} - -/** - * pid_alive - check that a task structure is not stale - * @p: Task structure to be checked. - * - * Test if a process is not yet dead (at most zombie state) - * If pid_alive fails, then pointers within the task structure - * can be stale and must not be dereferenced. - * - * Return: 1 if the process is alive. 0 otherwise. - */ -static inline int pid_alive(const struct task_struct *p) -{ - return p->thread_pid != NULL; -} - -static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); -} - -static inline pid_t task_pgrp_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); -} - - -static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); -} - -static inline pid_t task_session_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); -} - -static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); -} - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); -} - -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; 
- - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - -/* Obsolete, do not use: */ -static inline pid_t task_pgrp_nr(struct task_struct *tsk) -{ - return task_pgrp_nr_ns(tsk, &init_pid_ns); -} - #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) @@ -1712,20 +1604,6 @@ static inline char task_state_to_char(struct task_struct *tsk) return task_index_to_char(task_state_index(tsk)); } -/** - * is_global_init - check if a task structure is init. Since init - * is free to have sub-threads we need to check tgid. - * @tsk: Task structure to be checked. - * - * Check if a task structure is the first user space task the kernel created. - * - * Return: 1 if the task structure is init. 0 otherwise. - */ -static inline int is_global_init(struct task_struct *tsk) -{ - return task_tgid_nr(tsk) == 1; -} - extern struct pid *cad_pid; /* diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 3499c1a8b9295..b847d8fa75a97 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 8b7787a543cde905e53eaf29172c9472fe8a6a75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:12:49 -0500 Subject: plist: Split out plist_types.h Trimming down sched.h dependencies: we don't want to include more than the base types. 
Signed-off-by: Kent Overstreet --- include/linux/plist.h | 12 +----------- include/linux/plist_types.h | 17 +++++++++++++++++ include/linux/sched.h | 2 +- 3 files changed, 19 insertions(+), 12 deletions(-) create mode 100644 include/linux/plist_types.h (limited to 'include/linux') diff --git a/include/linux/plist.h b/include/linux/plist.h index 0f352c1d3c805..8c1c8adf7fe94 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -75,20 +75,10 @@ #include #include -#include +#include #include -struct plist_head { - struct list_head node_list; -}; - -struct plist_node { - int prio; - struct list_head prio_list; - struct list_head node_list; -}; - /** * PLIST_HEAD_INIT - static struct plist_head initializer * @head: struct plist_head variable name diff --git a/include/linux/plist_types.h b/include/linux/plist_types.h new file mode 100644 index 0000000000000..c37e784330af5 --- /dev/null +++ b/include/linux/plist_types.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_PLIST_TYPES_H +#define _LINUX_PLIST_TYPES_H + +#include + +struct plist_head { + struct list_head node_list; +}; + +struct plist_node { + int prio; + struct list_head prio_list; + struct list_head node_list; +}; + +#endif /* _LINUX_PLIST_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9e2708c2cfa66..8c230f24688bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 6dfeff09d5ad331905c7066207053d286d58ac83 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 18:14:41 +0000 Subject: wait: Remove uapi header file from main header file There's really no overlap between uapi/linux/wait.h and linux/wait.h. There are two files which rely on the uapi file being implicitly included, so explicitly include it there and remove it from the main header file. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Kent Overstreet Reviewed-by: Christian Brauner --- include/linux/wait.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 3473b663176f1..8aa3372f21a08 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -9,7 +9,6 @@ #include #include -#include typedef struct wait_queue_entry wait_queue_entry_t; -- cgit v1.2.3 From 097691960f7084ca82adb2e866d03a81753f0cb7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:28:46 -0500 Subject: rslib: kill bogus dependency on list.h list_head is defined in types.h, not list.h - this kills a sched.h dependency. Signed-off-by: Kent Overstreet --- include/linux/rslib.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rslib.h b/include/linux/rslib.h index 238bb85243d36..a04dacbdc8ae9 100644 --- a/include/linux/rslib.h +++ b/include/linux/rslib.h @@ -10,7 +10,6 @@ #ifndef _RSLIB_H_ #define _RSLIB_H_ -#include #include /* for gfp_t */ #include /* for GFP_KERNEL */ -- cgit v1.2.3 From eee51b0ae5c52a77ed65ad59b55002d1397b40d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:32:55 -0500 Subject: timerqueue: Split out timerqueue_types.h Trimming down sched.h dependencies: timerqueue_types can include just rbtree_types.h instead of pulling in rbtree.h. 
Cc: Thomas Gleixner Signed-off-by: Kent Overstreet --- include/linux/hrtimer_types.h | 2 +- include/linux/posix-timers_types.h | 2 +- include/linux/timerqueue.h | 13 +------------ include/linux/timerqueue_types.h | 17 +++++++++++++++++ 4 files changed, 20 insertions(+), 14 deletions(-) create mode 100644 include/linux/timerqueue_types.h (limited to 'include/linux') diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index f4ef391b96a7a..ad66a30817357 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -3,7 +3,7 @@ #define _LINUX_HRTIMER_TYPES_H #include -#include +#include struct hrtimer_clock_base; diff --git a/include/linux/posix-timers_types.h b/include/linux/posix-timers_types.h index 4783fa17bfeb1..a4712c1008c97 100644 --- a/include/linux/posix-timers_types.h +++ b/include/linux/posix-timers_types.h @@ -3,7 +3,7 @@ #define _linux_POSIX_TIMERS_TYPES_H #include -#include +#include #include /* diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index adc80e29168ea..62973f7d4610f 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -3,18 +3,7 @@ #define _LINUX_TIMERQUEUE_H #include -#include - - -struct timerqueue_node { - struct rb_node node; - ktime_t expires; -}; - -struct timerqueue_head { - struct rb_root_cached rb_root; -}; - +#include extern bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h new file mode 100644 index 0000000000000..dc298d0923e3b --- /dev/null +++ b/include/linux/timerqueue_types.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMERQUEUE_TYPES_H +#define _LINUX_TIMERQUEUE_TYPES_H + +#include +#include + +struct timerqueue_node { + struct rb_node node; + ktime_t expires; +}; + +struct timerqueue_head { + struct rb_root_cached rb_root; +}; + +#endif /* _LINUX_TIMERQUEUE_TYPES_H */ -- cgit v1.2.3 From 
22c336d0d3118824fed08834069568c57c5641a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:34:45 -0500 Subject: signal: Kill bogus dependency on list.h list_head is in types.h, not list.h. Signed-off-by: Kent Overstreet --- include/linux/signal.h | 1 + include/linux/signal_types.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/signal.h b/include/linux/signal.h index 3b98e7a28538b..f19816832f055 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -3,6 +3,7 @@ #define _LINUX_SIGNAL_H #include +#include #include #include diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index a70b2bdbf4d96..caf4f7a59ab96 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -6,7 +6,7 @@ * Basic signal handling related data type definitions: */ -#include +#include #include typedef struct kernel_siginfo { -- cgit v1.2.3 From dff0fd233a5104337069603d201f8cad74bc0e5a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:53:34 -0500 Subject: timers: Split out timer_types.h Cutting down on sched.h dependencies: this is going to be used in workqueue_types.h in the next patch, so we can kill the sched.h dependency on workqueue.h. 
Signed-off-by: Kent Overstreet --- include/linux/timer.h | 16 +--------------- include/linux/timer_types.h | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 15 deletions(-) create mode 100644 include/linux/timer_types.h (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index 26a545bb0153c..f18a2f1eb79e2 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -7,21 +7,7 @@ #include #include #include - -struct timer_list { - /* - * All fields that change during normal runtime grouped to the - * same cacheline - */ - struct hlist_node entry; - unsigned long expires; - void (*function)(struct timer_list *); - u32 flags; - -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; +#include #ifdef CONFIG_LOCKDEP /* diff --git a/include/linux/timer_types.h b/include/linux/timer_types.h new file mode 100644 index 0000000000000..fae5a388f9149 --- /dev/null +++ b/include/linux/timer_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TIMER_TYPES_H +#define _LINUX_TIMER_TYPES_H + +#include +#include + +struct timer_list { + /* + * All fields that change during normal runtime grouped to the + * same cacheline + */ + struct hlist_node entry; + unsigned long expires; + void (*function)(struct timer_list *); + u32 flags; + +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +#endif /* _LINUX_TIMER_TYPES_H */ -- cgit v1.2.3 From b2fa8443db320c4873feca2588b957439e350890 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:55:01 -0500 Subject: workqueue: Split out workqueue_types.h More sched.h dependency culling - this lets us kill a rhashtable-types.h dependency on workqueue.h. 
Signed-off-by: Kent Overstreet --- include/linux/dma-fence.h | 1 + include/linux/rhashtable-types.h | 2 +- include/linux/timekeeping.h | 1 + include/linux/workqueue.h | 16 +--------------- include/linux/workqueue_types.h | 25 +++++++++++++++++++++++++ 5 files changed, 29 insertions(+), 16 deletions(-) create mode 100644 include/linux/workqueue_types.h (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index b3772edca2e6e..e06bad467f55e 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -21,6 +21,7 @@ #include #include #include +#include struct dma_fence; struct dma_fence_ops; diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 57467cbf4c5b1..b6f3797277ff8 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include struct rhash_head { struct rhash_head __rcu *next; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index fe1e467ba046f..7c43e98cf2115 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -4,6 +4,7 @@ #include #include +#include /* Included from linux/ktime.h */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d4..f1bb2e35301f9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -14,12 +14,7 @@ #include #include #include - -struct workqueue_struct; - -struct work_struct; -typedef void (*work_func_t)(struct work_struct *work); -void delayed_work_timer_fn(struct timer_list *t); +#include /* * The first word is the work queue pointer and the flags rolled into @@ -95,15 +90,6 @@ enum { #define WORK_STRUCT_FLAG_MASK ((1ul << WORK_STRUCT_FLAG_BITS) - 1) #define WORK_STRUCT_WQ_DATA_MASK (~WORK_STRUCT_FLAG_MASK) -struct work_struct { - atomic_long_t data; - struct list_head entry; - work_func_t func; -#ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; -#endif -}; - 
#define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) diff --git a/include/linux/workqueue_types.h b/include/linux/workqueue_types.h new file mode 100644 index 0000000000000..4c38824f3ab4f --- /dev/null +++ b/include/linux/workqueue_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_WORKQUEUE_TYPES_H +#define _LINUX_WORKQUEUE_TYPES_H + +#include +#include +#include +#include + +struct workqueue_struct; + +struct work_struct; +typedef void (*work_func_t)(struct work_struct *work); +void delayed_work_timer_fn(struct timer_list *t); + +struct work_struct { + atomic_long_t data; + struct list_head entry; + work_func_t func; +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +#endif /* _LINUX_WORKQUEUE_TYPES_H */ -- cgit v1.2.3 From bc46ef3cea3d6f63952d7e29a324e889c34970a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 13:58:25 -0500 Subject: shm: Slim down dependencies list_head is in types.h, not list.h, and the uapi header wasn't needed. Signed-off-by: Kent Overstreet --- include/linux/shm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shm.h b/include/linux/shm.h index d8e69aed3d322..c55bef0538e58 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -2,12 +2,12 @@ #ifndef _LINUX_SHM_H_ #define _LINUX_SHM_H_ -#include +#include #include -#include #include struct file; +struct task_struct; #ifdef CONFIG_SYSVIPC struct sysv_shm { -- cgit v1.2.3 From 72375a8864ebc0a20ca4a35f382441b01a0b85b9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:00:10 -0500 Subject: ipc: Kill bogus dependency on spinlock.h pruning sched.h dependencies, headers shouldn't pull in more than they need. 
Signed-off-by: Kent Overstreet --- include/linux/ipc.h | 2 +- include/linux/sched.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipc.h b/include/linux/ipc.h index e1c9eea6015b5..9b1434247aab7 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -2,7 +2,7 @@ #ifndef _LINUX_IPC_H #define _LINUX_IPC_H -#include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 8c230f24688bd..34400b16e57ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,8 @@ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); +#include + /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. -- cgit v1.2.3 From 9983deb26d9021aecd971d25abf4cd263c72c385 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:01:25 -0500 Subject: Split out irqflags_types.h We're working on only pulling in type definitions to sched.h whenever possible. Signed-off-by: Kent Overstreet --- include/linux/irqflags.h | 14 +------------- include/linux/irqflags_types.h | 22 ++++++++++++++++++++++ include/linux/sched.h | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) create mode 100644 include/linux/irqflags_types.h (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 2b665c32f5fe6..147feebd508ca 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,6 +12,7 @@ #ifndef _LINUX_TRACE_IRQFLAGS_H #define _LINUX_TRACE_IRQFLAGS_H +#include #include #include #include @@ -34,19 +35,6 @@ #ifdef CONFIG_TRACE_IRQFLAGS -/* Per-task IRQ trace events information. 
*/ -struct irqtrace_events { - unsigned int irq_events; - unsigned long hardirq_enable_ip; - unsigned long hardirq_disable_ip; - unsigned int hardirq_enable_event; - unsigned int hardirq_disable_event; - unsigned long softirq_disable_ip; - unsigned long softirq_enable_ip; - unsigned int softirq_disable_event; - unsigned int softirq_enable_event; -}; - DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); diff --git a/include/linux/irqflags_types.h b/include/linux/irqflags_types.h new file mode 100644 index 0000000000000..c13f0d915097a --- /dev/null +++ b/include/linux/irqflags_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQFLAGS_TYPES_H +#define _LINUX_IRQFLAGS_TYPES_H + +#ifdef CONFIG_TRACE_IRQFLAGS + +/* Per-task IRQ trace events information. */ +struct irqtrace_events { + unsigned int irq_events; + unsigned long hardirq_enable_ip; + unsigned long hardirq_disable_ip; + unsigned int hardirq_enable_event; + unsigned int hardirq_disable_event; + unsigned long softirq_disable_ip; + unsigned long softirq_enable_ip; + unsigned int softirq_disable_event; + unsigned int softirq_enable_event; +}; + +#endif + +#endif /* _LINUX_IRQFLAGS_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 34400b16e57ce..026390e6fe59b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -18,7 +19,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 959d8dc8046186ffea5410f51fcb309880f0dfaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:15:35 -0500 Subject: mm_types_task.h: Trim dependencies more sched.h header dependency trimming Signed-off-by: Kent Overstreet --- include/linux/mm_types_task.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 
aa44fff8bb9da..a2f6179b672b8 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -9,9 +9,6 @@ */ #include -#include -#include -#include #include @@ -36,6 +33,8 @@ enum { NR_MM_COUNTERS }; +struct page; + struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) -- cgit v1.2.3 From 55b899aa3e7d0dc02ff9075b883d29eb2d0cb49a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 14:25:40 -0500 Subject: syscall_user_dispatch.h: split out *_types.h thread_info.h pulls in a lot of junk that sched.h doesn't need; in particular, this helps to kill the printk.h dependency. Signed-off-by: Kent Overstreet --- include/linux/sched.h | 2 +- include/linux/syscall_user_dispatch.h | 9 +-------- include/linux/syscall_user_dispatch_types.h | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 9 deletions(-) create mode 100644 include/linux/syscall_user_dispatch_types.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 026390e6fe59b..925711edd7f70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h index 641ca8880995e..3858a6ffdd5c8 100644 --- a/include/linux/syscall_user_dispatch.h +++ b/include/linux/syscall_user_dispatch.h @@ -6,16 +6,10 @@ #define _SYSCALL_USER_DISPATCH_H #include +#include #ifdef CONFIG_GENERIC_ENTRY -struct syscall_user_dispatch { - char __user *selector; - unsigned long offset; - unsigned long len; - bool on_dispatch; -}; - int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, unsigned long len, char __user *selector); @@ -29,7 +23,6 @@ int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long siz void __user *data); #else -struct syscall_user_dispatch {}; static inline int 
set_syscall_user_dispatch(unsigned long mode, unsigned long offset, unsigned long len, char __user *selector) diff --git a/include/linux/syscall_user_dispatch_types.h b/include/linux/syscall_user_dispatch_types.h new file mode 100644 index 0000000000000..3be36b06c7d70 --- /dev/null +++ b/include/linux/syscall_user_dispatch_types.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _SYSCALL_USER_DISPATCH_TYPES_H +#define _SYSCALL_USER_DISPATCH_TYPES_H + +#include + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +#else + +struct syscall_user_dispatch {}; + +#endif + +#endif /* _SYSCALL_USER_DISPATCH_TYPES_H */ -- cgit v1.2.3 From f9d6966b7f4182f612208f9dad9e2cfaaf667ba3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 15:15:38 -0500 Subject: refcount: Split out refcount_types.h More trimming of sched.h dependencies. Signed-off-by: Kent Overstreet --- include/linux/refcount.h | 13 +------------ include/linux/refcount_types.h | 19 +++++++++++++++++++ include/linux/sched.h | 2 +- include/linux/sched/task.h | 1 + 4 files changed, 22 insertions(+), 13 deletions(-) create mode 100644 include/linux/refcount_types.h (limited to 'include/linux') diff --git a/include/linux/refcount.h b/include/linux/refcount.h index a62fcca974861..85c6df0d1bef4 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -96,22 +96,11 @@ #include #include #include +#include #include struct mutex; -/** - * typedef refcount_t - variant of atomic_t specialized for reference counts - * @refs: atomic_t counter field - * - * The counter saturates at REFCOUNT_SATURATED and will not move once - * there. This avoids wrapping the counter and causing 'spurious' - * use-after-free bugs. 
- */ -typedef struct refcount_struct { - atomic_t refs; -} refcount_t; - #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } #define REFCOUNT_MAX INT_MAX #define REFCOUNT_SATURATED (INT_MIN / 2) diff --git a/include/linux/refcount_types.h b/include/linux/refcount_types.h new file mode 100644 index 0000000000000..162004f06edf7 --- /dev/null +++ b/include/linux/refcount_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_REFCOUNT_TYPES_H +#define _LINUX_REFCOUNT_TYPES_H + +#include + +/** + * typedef refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at REFCOUNT_SATURATED and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. + */ +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +#endif /* _LINUX_REFCOUNT_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 925711edd7f70..414e4df701ecd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index a23af225c8983..1880ae21a9cb7 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -7,6 +7,7 @@ * functionality: */ +#include #include #include -- cgit v1.2.3 From a6e1420ce4fc91da56c0a2444c4482245e7617d4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 15:30:14 -0500 Subject: seccomp: Split out seccomp_types.h More pruning of sched.h dependencies. 
Signed-off-by: Kent Overstreet --- include/linux/sched.h | 2 +- include/linux/seccomp.h | 22 +--------------------- include/linux/seccomp_types.h | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 22 deletions(-) create mode 100644 include/linux/seccomp_types.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 414e4df701ecd..db6e3e6296065 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 1ec0d8dc4b69d..709ad84809e1e 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -3,6 +3,7 @@ #define _LINUX_SECCOMP_H #include +#include #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ SECCOMP_FILTER_FLAG_LOG | \ @@ -21,25 +22,6 @@ #include #include -struct seccomp_filter; -/** - * struct seccomp - the state of a seccomp'ed process - * - * @mode: indicates one of the valid values above for controlled - * system calls available to a process. - * @filter_count: number of seccomp filters - * @filter: must always point to a valid seccomp-filter or NULL as it is - * accessed without locking during system call entry. - * - * @filter must only be accessed from the context of current as there - * is no read locking. 
- */ -struct seccomp { - int mode; - atomic_t filter_count; - struct seccomp_filter *filter; -}; - #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) @@ -64,8 +46,6 @@ static inline int seccomp_mode(struct seccomp *s) #include -struct seccomp { }; -struct seccomp_filter { }; struct seccomp_data; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER diff --git a/include/linux/seccomp_types.h b/include/linux/seccomp_types.h new file mode 100644 index 0000000000000..cf0a0355024f0 --- /dev/null +++ b/include/linux/seccomp_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SECCOMP_TYPES_H +#define _LINUX_SECCOMP_TYPES_H + +#include + +#ifdef CONFIG_SECCOMP + +struct seccomp_filter; +/** + * struct seccomp - the state of a seccomp'ed process + * + * @mode: indicates one of the valid values above for controlled + * system calls available to a process. + * @filter_count: number of seccomp filters + * @filter: must always point to a valid seccomp-filter or NULL as it is + * accessed without locking during system call entry. + * + * @filter must only be accessed from the context of current as there + * is no read locking. + */ +struct seccomp { + int mode; + atomic_t filter_count; + struct seccomp_filter *filter; +}; + +#else + +struct seccomp { }; +struct seccomp_filter { }; + +#endif + +#endif /* _LINUX_SECCOMP_TYPES_H */ -- cgit v1.2.3 From af6da56a223831cb74d1cf006f5742db6403398e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 15:51:30 -0500 Subject: uidgid: Split out uidgid_types.h More sched.h dependency pruning. 
Signed-off-by: Kent Overstreet Reviewed-by: Christian Brauner --- include/linux/sched.h | 1 + include/linux/uidgid.h | 11 +---------- include/linux/uidgid_types.h | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 10 deletions(-) create mode 100644 include/linux/uidgid_types.h (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index db6e3e6296065..10e25c7fc7936 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -37,6 +37,7 @@ #include #include #include +#include #include /* task_struct member predeclarations (sorted alphabetically): */ diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index b0542cd11aeb0..ba20b62f13e1d 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -12,21 +12,12 @@ * to detect when we overlook these differences. * */ -#include +#include #include struct user_namespace; extern struct user_namespace init_user_ns; -typedef struct { - uid_t val; -} kuid_t; - - -typedef struct { - gid_t val; -} kgid_t; - #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } diff --git a/include/linux/uidgid_types.h b/include/linux/uidgid_types.h new file mode 100644 index 0000000000000..b35ac4955a334 --- /dev/null +++ b/include/linux/uidgid_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UIDGID_TYPES_H +#define _LINUX_UIDGID_TYPES_H + +#include + +typedef struct { + uid_t val; +} kuid_t; + +typedef struct { + gid_t val; +} kgid_t; + +#endif /* _LINUX_UIDGID_TYPES_H */ -- cgit v1.2.3 From e034d49eb01c7c83a08a3ce2a1091b55f806b26b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 15:52:17 -0500 Subject: sem: Split out sem_types.h More sched.h dependency pruning. 
Signed-off-by: Kent Overstreet --- include/linux/audit.h | 1 + include/linux/sched.h | 4 +++- include/linux/sem.h | 10 +--------- include/linux/sem_types.h | 13 +++++++++++++ 4 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 include/linux/sem_types.h (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 51b1b7054a233..0050ef288ab3c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -36,6 +36,7 @@ struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; +struct kern_ipc_perm; struct audit_krule { u32 pflags; diff --git a/include/linux/sched.h b/include/linux/sched.h index 10e25c7fc7936..d341a6c302d65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -11,14 +11,16 @@ #include +#include #include #include -#include +#include #include #include #include #include #include +#include #include #include #include diff --git a/include/linux/sem.h b/include/linux/sem.h index 5608a500c43ea..c4deefe42aeb3 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -3,25 +3,17 @@ #define _LINUX_SEM_H #include +#include struct task_struct; -struct sem_undo_list; #ifdef CONFIG_SYSVIPC -struct sysv_sem { - struct sem_undo_list *undo_list; -}; - extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -struct sysv_sem { - /* empty */ -}; - static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) { return 0; diff --git a/include/linux/sem_types.h b/include/linux/sem_types.h new file mode 100644 index 0000000000000..73df1971a7ae1 --- /dev/null +++ b/include/linux/sem_types.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SEM_TYPES_H +#define _LINUX_SEM_TYPES_H + +struct sem_undo_list; + +struct sysv_sem { +#ifdef CONFIG_SYSVIPC + struct sem_undo_list *undo_list; +#endif +}; + +#endif /* _LINUX_SEM_TYPES_H */ -- cgit v1.2.3 From 
99bac36667b6b20b9b0a20dc976365d23f90628b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 16:58:51 -0500 Subject: lockdep: move held_lock to lockdep_types.h held_lock is embedded in task_struct, and we don't want sched.h pulling in all of lockdep.h Signed-off-by: Kent Overstreet Acked-by: Waiman Long --- include/linux/lockdep.h | 57 ------------------------------------------- include/linux/lockdep_types.h | 57 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index dc2844b071c2c..08b0d1d9d78b7 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -82,63 +82,6 @@ struct lock_chain { u64 chain_key; }; -#define MAX_LOCKDEP_KEYS_BITS 13 -#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) -#define INITIAL_CHAIN_KEY -1 - -struct held_lock { - /* - * One-way hash of the dependency chain up to this point. We - * hash the hashes step by step as the dependency chain grows. - * - * We use it for dependency-caching and we skip detection - * passes and dependency-updates if there is a cache-hit, so - * it is absolutely critical for 100% coverage of the validator - * to have a unique key value for every unique dependency path - * that can occur in the system, to make a unique hash value - * as likely as possible - hence the 64-bit width. - * - * The task struct holds the current hash value (initialized - * with zero), here we store the previous hash value: - */ - u64 prev_chain_key; - unsigned long acquire_ip; - struct lockdep_map *instance; - struct lockdep_map *nest_lock; -#ifdef CONFIG_LOCK_STAT - u64 waittime_stamp; - u64 holdtime_stamp; -#endif - /* - * class_idx is zero-indexed; it points to the element in - * lock_classes this held lock instance belongs to. class_idx is in - * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. 
- */ - unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; - /* - * The lock-stack is unified in that the lock chains of interrupt - * contexts nest ontop of process context chains, but we 'separate' - * the hashes by starting with 0 if we cross into an interrupt - * context, and we also keep do not add cross-context lock - * dependencies - the lock usage graph walking covers that area - * anyway, and we'd just unnecessarily increase the number of - * dependencies otherwise. [Note: hardirq and softirq contexts - * are separated from each other too.] - * - * The following field is used to detect when we cross into an - * interrupt context: - */ - unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ - unsigned int trylock:1; /* 16 bits */ - - unsigned int read:2; /* see lock_acquire() comment */ - unsigned int check:1; /* see lock_acquire() comment */ - unsigned int hardirqs_off:1; - unsigned int sync:1; - unsigned int references:11; /* 32 bits */ - unsigned int pin_count; -}; - /* * Initialization, self-test and debugging-output methods: */ diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 2ebc323d345ae..9c533c8d701e7 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -198,6 +198,63 @@ struct lockdep_map { struct pin_cookie { unsigned int val; }; +#define MAX_LOCKDEP_KEYS_BITS 13 +#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) +#define INITIAL_CHAIN_KEY -1 + +struct held_lock { + /* + * One-way hash of the dependency chain up to this point. We + * hash the hashes step by step as the dependency chain grows. + * + * We use it for dependency-caching and we skip detection + * passes and dependency-updates if there is a cache-hit, so + * it is absolutely critical for 100% coverage of the validator + * to have a unique key value for every unique dependency path + * that can occur in the system, to make a unique hash value + * as likely as possible - hence the 64-bit width. 
+ * + * The task struct holds the current hash value (initialized + * with zero), here we store the previous hash value: + */ + u64 prev_chain_key; + unsigned long acquire_ip; + struct lockdep_map *instance; + struct lockdep_map *nest_lock; +#ifdef CONFIG_LOCK_STAT + u64 waittime_stamp; + u64 holdtime_stamp; +#endif + /* + * class_idx is zero-indexed; it points to the element in + * lock_classes this held lock instance belongs to. class_idx is in + * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. + */ + unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; + /* + * The lock-stack is unified in that the lock chains of interrupt + * contexts nest ontop of process context chains, but we 'separate' + * the hashes by starting with 0 if we cross into an interrupt + * context, and we also keep do not add cross-context lock + * dependencies - the lock usage graph walking covers that area + * anyway, and we'd just unnecessarily increase the number of + * dependencies otherwise. [Note: hardirq and softirq contexts + * are separated from each other too.] + * + * The following field is used to detect when we cross into an + * interrupt context: + */ + unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ + unsigned int trylock:1; /* 16 bits */ + + unsigned int read:2; /* see lock_acquire() comment */ + unsigned int check:1; /* see lock_acquire() comment */ + unsigned int hardirqs_off:1; + unsigned int sync:1; + unsigned int references:11; /* 32 bits */ + unsigned int pin_count; +}; + #else /* !CONFIG_LOCKDEP */ /* -- cgit v1.2.3 From cba6167f0adb07b6cd1b8758dd67718c772e108c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Dec 2023 17:00:41 -0500 Subject: restart_block: Trim includes We don't actually use any timekeeping types, no need to pull in time64.h. Also, sched.h uses restart_block; add it as a direct dependency. 
Signed-off-by: Kent Overstreet --- include/linux/restart_block.h | 2 +- include/linux/sched.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 980a65594412d..13f17676c5f49 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -7,8 +7,8 @@ #include #include -#include +struct __kernel_timespec; struct timespec; struct old_timespec32; struct pollfd; diff --git a/include/linux/sched.h b/include/linux/sched.h index d341a6c302d65..dd002d1937268 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 15 Dec 2023 19:31:14 -0800 Subject: net/mlx5e: Use the correct lag ports number when creating TISes The cited commit moved the code of mlx5e_create_tises() and changed the loop to create TISes over MLX5_MAX_PORTS constant value, instead of getting the correct lag ports supported by the device, which can cause FW errors on devices with fewer than MLX5_MAX_PORTS ports. Change that back to mlx5e_get_num_lag_ports(mdev). Also IPoIB interfaces create their own TISes, they don't use the eth TISes, pass a flag to indicate that. 
Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources") Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7ee5b79ff3d60..aafb36c9e5d9d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,6 +681,7 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; + bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit v1.2.3 From e04984a37398b3f4f5a79c993b94c6b1224184cc Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 19 Dec 2023 14:46:20 +0200 Subject: net/mlx5: Fix query of sd_group field The sd_group field moved in the HW spec from the MPIR register to the vport context. Align the query accordingly. Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++--- include/linux/mlx5/vport.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fee20fc010c2d..bf2d51952e48a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4030,8 +4030,13 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xd0]; + u8 reserved_at_60[0xa0]; + u8 reserved_at_100[0x1]; + u8 sd_group[0x3]; + u8 reserved_at_104[0x1c]; + + u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10116,8 +10121,7 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x15]; - u8 sd_group[0x3]; + u8 reserved_at_28[0x18]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h 
b/include/linux/mlx5/vport.h index fbb9bf4478894..c36cc6d829267 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit v1.2.3 From c88c49ac9c18fb7c3fa431126de1d8f8f555e912 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 5 Dec 2023 23:54:21 +0200 Subject: net/mlx5: Enable SD feature Have an actual mlx5_sd instance in the core device, and fix the getter accordingly. This allows SD stuff to flow, the feature becomes supported only here. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aafb36c9e5d9d..cd286b681970f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -822,6 +822,7 @@ struct mlx5_core_dev { struct blocking_notifier_head macsec_nh; #endif u64 num_ipsec_offloads; + struct mlx5_sd *sd; }; struct mlx5_db { -- cgit v1.2.3 From 22c4640698a1d47606b5a4264a584e8046641784 Mon Sep 17 00:00:00 2001 From: Armen Ratner Date: Fri, 8 Sep 2023 14:53:09 -0500 Subject: net/mlx5: Implement management PF Ethernet profile Add management PF modules, which introduce support for the structures needed to create the resources for the MGMT PF to work. Also, add the necessary calls and functions to establish this functionality. 
Signed-off-by: Armen Ratner Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Reviewed-by: Daniel Jurgens --- include/linux/mlx5/driver.h | 8 ++++++++ include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index cd286b681970f..2bba88c67f583 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1224,6 +1224,14 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) return dev->caps.embedded_cpu; } +static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev) +{ + if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid)) + return false; + + return MLX5_CAP_GEN_2(dev, local_mng_port); +} + static inline bool mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf2d51952e48a..5865692092544 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1954,8 +1954,10 @@ enum { struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; - u8 migratable[0x1]; - u8 reserved_at_81[0x1f]; + u8 migratable[0x1]; + u8 reserved_at_81[0x19]; + u8 local_mng_port[0x1]; + u8 reserved_at_9b[0x5]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -1973,7 +1975,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 allowed_object_for_other_vhca_access[0x40]; - u8 reserved_at_140[0x60]; + u8 reserved_at_140[0x20]; + + u8 reserved_at_160[0xa]; + u8 local_mng_port_valid[0x1]; + u8 reserved_at_16b[0x15]; + + u8 reserved_at_180[0x20]; u8 flow_table_type_2_type[0x8]; u8 reserved_at_1a8[0x3]; -- cgit v1.2.3 From ee9ec49046951eff704752669f0c388b506ddbdf Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 19 Dec 2023 18:50:02 +0100 Subject: iio: buffer-dma: Get rid of outgoing queue The buffer-dma code was using two queues, incoming and outgoing, to manage the state of the 
blocks in use. While this totally works, it adds some complexity to the code, especially since the code only manages 2 blocks. It is much easier to just check each block's state manually, and keep a counter for the next block to dequeue. Since the new DMABUF based API wouldn't use the outgoing queue anyway, getting rid of it now makes the upcoming changes simpler. With this change, the IIO_BLOCK_STATE_DEQUEUED is now useless, and can be removed. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20231219175009.65482-2-paul@crapouillou.net Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dma.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 6564bdcdac66c..18d3702fa95d1 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -19,14 +19,12 @@ struct device; /** * enum iio_block_state - State of a struct iio_dma_buffer_block - * @IIO_BLOCK_STATE_DEQUEUED: Block is not queued * @IIO_BLOCK_STATE_QUEUED: Block is on the incoming queue * @IIO_BLOCK_STATE_ACTIVE: Block is currently being processed by the DMA * @IIO_BLOCK_STATE_DONE: Block is on the outgoing queue * @IIO_BLOCK_STATE_DEAD: Block has been marked as to be freed */ enum iio_block_state { - IIO_BLOCK_STATE_DEQUEUED, IIO_BLOCK_STATE_QUEUED, IIO_BLOCK_STATE_ACTIVE, IIO_BLOCK_STATE_DONE, @@ -73,12 +71,15 @@ struct iio_dma_buffer_block { * @active_block: Block being used in read() * @pos: Read offset in the active block * @block_size: Size of each block + * @next_dequeue: index of next block that will be dequeued */ struct iio_dma_buffer_queue_fileio { struct iio_dma_buffer_block *blocks[2]; struct iio_dma_buffer_block *active_block; size_t pos; size_t block_size; + + unsigned int next_dequeue; }; /** @@ -93,7 +94,6 @@ struct iio_dma_buffer_queue_fileio { * list and typically also a list of active blocks in the part that handles * the DMA 
controller * @incoming: List of buffers on the incoming queue - * @outgoing: List of buffers on the outgoing queue * @active: Whether the buffer is currently active * @fileio: FileIO state */ @@ -105,7 +105,6 @@ struct iio_dma_buffer_queue { struct mutex lock; spinlock_t list_lock; struct list_head incoming; - struct list_head outgoing; bool active; -- cgit v1.2.3 From 3efdc78fdc21ab82694707eb234ab93f28d13ba8 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 13 Dec 2023 22:44:38 -0800 Subject: fs/proc: show correct device and inode numbers in /proc/pid/maps /proc/pid/maps shows device and inode numbers of vma->vm_file-s. Here is an issue. If a mapped file is on a stackable file system (e.g., overlayfs), vma->vm_file is a backing file whose f_inode is on the underlying filesystem. To show correct numbers, we need to get a user file and shows its numbers. The same trick is used to show file paths in /proc/pid/maps. Cc: Alexander Mikhalitsyn Suggested-by: Amir Goldstein Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20231214064439.1023011-1-avagin@google.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/linux/fs.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index f171505940ff7..a3a48a5d87281 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2523,20 +2523,28 @@ struct file *backing_file_open(const struct path *user_path, int flags, struct path *backing_file_user_path(struct file *f); /* - * file_user_path - get the path to display for memory mapped file - * * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying - * filesystem. When the mapped file path is displayed to user (e.g. 
via - * /proc//maps), this helper should be used to get the path to display - * to the user, which is the path of the fd that user has requested to map. + * filesystem. When the mapped file path and inode number are displayed to + * user (e.g. via /proc//maps), these helpers should be used to get the + * path and inode number to display to the user, which is the path of the fd + * that user has requested to map and the inode number that would be returned + * by fstat() on that same fd. */ +/* Get the path to display in /proc//maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } +/* Get the inode whose inode number to display in /proc//maps */ +static inline const struct inode *file_user_inode(struct file *f) +{ + if (unlikely(f->f_mode & FMODE_BACKING)) + return d_inode(backing_file_user_path(f)->dentry); + return file_inode(f); +} static inline struct file *file_clone_open(struct file *file) { -- cgit v1.2.3 From 5ae81209491ed3718fee798db6fb2cc81214824c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 14:52:36 +0100 Subject: driver core: bus: make bus_sort_breadthfirst() take a const pointer For some reason, during the big "clean up the driver core for a const struct bus_type" work, the bus_sort_breadthfirst() call was missed. Fix this up by changing the type to be a const * as it should be. Cc: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/2023121935-stinking-ditzy-fd5d@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device/bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index ae10c43227543..25127f7503494 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -232,7 +232,7 @@ bus_find_device_by_acpi_dev(const struct bus_type *bus, const void *adev) int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)); -void bus_sort_breadthfirst(struct bus_type *bus, +void bus_sort_breadthfirst(const struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)); /* -- cgit v1.2.3 From 32f78abe59c740b6ec34c89dc10a09208eae7e1f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 14:15:09 +0100 Subject: driver core: bus: constantify subsys_register() calls The functions subsys_register() and subsys_virtual_register() should be taking a constant pointer to a struct bus_type, as they do not actually modify anything in it, so fix up the function definitions to do so properly. This also changes the pointer type in struct subsys_interface to be constant as well, as again, that's the proper signature of it. Cc: Rafael J. 
Wysocki Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/2023121908-grove-genetics-f8af@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 4aa34c8d13610..aefc5ca7f1cfc 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -63,7 +63,7 @@ struct msi_device_data; */ struct subsys_interface { const char *name; - struct bus_type *subsys; + const struct bus_type *subsys; struct list_head node; int (*add_dev)(struct device *dev, struct subsys_interface *sif); void (*remove_dev)(struct device *dev, struct subsys_interface *sif); @@ -72,9 +72,9 @@ struct subsys_interface { int subsys_interface_register(struct subsys_interface *sif); void subsys_interface_unregister(struct subsys_interface *sif); -int subsys_system_register(struct bus_type *subsys, +int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups); -int subsys_virtual_register(struct bus_type *subsys, +int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups); /* -- cgit v1.2.3 From dedb868994d8308c6c4650203e190ec619005806 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 16:03:20 +0100 Subject: driver core: container: make container_subsys const Now that the driver core can properly handle constant struct bus_type, move the container_subsys variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/2023121919-chatter-grumbling-9ef3@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/container.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/container.h b/include/linux/container.h index 2566a1baa736a..dd00cc918a926 100644 --- a/include/linux/container.h +++ b/include/linux/container.h @@ -12,7 +12,7 @@ #include /* drivers/base/power/container.c */ -extern struct bus_type container_subsys; +extern const struct bus_type container_subsys; struct container_dev { struct device dev; -- cgit v1.2.3 From 39299bdd2546688d92ed9db4948f6219ca1b9542 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 9 Dec 2023 00:41:55 +0000 Subject: keys, dns: Allow key types (eg. DNS) to be reclaimed immediately on expiry If a key has an expiration time, then when that time passes, the key is left around for a certain amount of time before being collected (5 mins by default) so that EKEYEXPIRED can be returned instead of ENOKEY. This is a problem for DNS keys because we want to redo the DNS lookup immediately at that point. Fix this by allowing key types to be marked such that keys of that type don't have this extra period, but are reclaimed as soon as they expire and turn this on for dns_resolver-type keys. To make this easier to handle, key->expiry is changed to be permanent if TIME64_MAX rather than 0. Furthermore, give such new-style negative DNS results a 1s default expiry if no other expiry time is set rather than allowing it to stick around indefinitely. This shouldn't be zero as ls will follow a failing stat call immediately with a second with AT_SYMLINK_NOFOLLOW added. Fixes: 1a4240f4764a ("DNS: Separate out CIFS DNS Resolver code") Signed-off-by: David Howells Tested-by: Markus Suvanto cc: Wang Lei cc: Jeff Layton cc: Steve French cc: Marc Dionne cc: Jarkko Sakkinen cc: "David S. 
Miller" cc: Eric Dumazet cc: Jakub Kicinski cc: Paolo Abeni cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: keyrings@vger.kernel.org cc: netdev@vger.kernel.org --- include/linux/key-type.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 7d985a1dfe4af..5caf3ce823733 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -73,6 +73,7 @@ struct key_type { unsigned int flags; #define KEY_TYPE_NET_DOMAIN 0x00000001 /* Keys of this type have a net namespace domain */ +#define KEY_TYPE_INSTANT_REAP 0x00000002 /* Keys of this type don't have a delay after expiring */ /* vet a description */ int (*vet_description)(const char *description); -- cgit v1.2.3 From 4515d08a742c76612b65d2f47a87d12860519842 Mon Sep 17 00:00:00 2001 From: Marco Pagani Date: Thu, 21 Dec 2023 17:58:47 +0100 Subject: kernel/module: improve documentation for try_module_get() The sentence "this call will fail if the module is already being removed" is potentially confusing and may contradict the rest of the documentation. If one tries to get a module that has already been removed using a stale pointer, the kernel will crash. Signed-off-by: Marco Pagani Signed-off-by: Luis Chamberlain --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index a98e188cf37b8..08364d5cbc079 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -668,7 +668,7 @@ extern void __module_get(struct module *module); * @module: the module we should check for * * Only try to get a module reference count if the module is not being removed. - * This call will fail if the module is already being removed. + * This call will fail if the module is in the process of being removed. 
* * Care must also be taken to ensure the module exists and is alive prior to * usage of this call. This can be gauranteed through two means: -- cgit v1.2.3 From 645f3d85129d8aac3b896ba685fbc20a31c2c036 Mon Sep 17 00:00:00 2001 From: Mukesh Sisodiya Date: Wed, 20 Dec 2023 13:41:38 +0200 Subject: wifi: cfg80211: handle UHB AP and STA power type UHB AP send supported power type(LPI, SP, VLP) in beacon and probe response IE and STA should connect to these AP only if their regulatory support the AP power type. Beacon/Probe response are reported to userspace with reason "STA regulatory not supporting to connect to AP based on transmitted power type" and it should not connect to AP. Signed-off-by: Mukesh Sisodiya Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231220133549.cbfbef9170a9.I432f78438de18aa9f5c9006be12e41dc34cc47c5@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8ad008591e320..2f55544820477 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2720,6 +2720,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) #define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 #define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 +#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 /** * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field -- cgit v1.2.3 From d68019471995ba47e56a9da355df13a1cdb5bf7e Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Dec 2023 08:45:18 +0100 Subject: entry: Move exit to usermode functions to header file To allow inlining, move exit_to_user_mode() to entry-common.h. 
Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231218074520.1998026-2-svens@linux.ibm.com --- include/linux/entry-common.h | 53 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d95ab85f96ba5..6a6e98f3805fe 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -7,6 +7,10 @@ #include #include #include +#include +#include +#include +#include #include @@ -258,6 +262,43 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); +/** + * exit_to_user_mode_loop - do any pending work before leaving to user space + */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + unsigned long ti_work); + +/** + * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * 1) check that interrupts are disabled + * 2) call tick_nohz_user_enter_prepare() + * 3) call exit_to_user_mode_loop() if any flags from + * EXIT_TO_USER_MODE_WORK are set + * 4) check that interrupts are still disabled + */ +static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +{ + unsigned long ti_work; + + lockdep_assert_irqs_disabled(); + + /* Flush pending rcuog wakeup before the last need_resched() check */ + tick_nohz_user_enter_prepare(); + + ti_work = read_thread_flags(); + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + + arch_exit_to_user_mode_prepare(regs, ti_work); + + /* Ensure that kernel state is sane for a return to userspace */ + kmap_assert_nomap(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -276,7 +317,17 @@ void arch_do_signal_or_restart(struct pt_regs *regs); * non-instrumentable. 
* The caller has to invoke syscall_exit_to_user_mode_work() before this. */ -void exit_to_user_mode(void); +static __always_inline void exit_to_user_mode(void) +{ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(); + instrumentation_end(); + + user_enter_irqoff(); + arch_exit_to_user_mode(); + lockdep_hardirqs_on(CALLER_ADDR0); +} /** * syscall_exit_to_user_mode_work - Handle work before returning to user mode -- cgit v1.2.3 From caf4062e35b21cd7d3d35ac2f58f9765d02d32a0 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Dec 2023 08:45:19 +0100 Subject: entry: Move enter_from_user_mode() to header file To allow inlining of enter_from_user_mode(), move it to entry-common.h. Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231218074520.1998026-3-svens@linux.ibm.com --- include/linux/entry-common.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 6a6e98f3805fe..c4205390448ee 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -102,7 +103,19 @@ static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} * done between establishing state and enabling interrupts. The caller must * enable interrupts before invoking syscall_enter_from_user_mode_work(). 
*/ -void enter_from_user_mode(struct pt_regs *regs); +static __always_inline void enter_from_user_mode(struct pt_regs *regs) +{ + arch_enter_from_user_mode(regs); + lockdep_hardirqs_off(CALLER_ADDR0); + + CT_WARN_ON(__ct_state() != CONTEXT_USER); + user_exit_irqoff(); + + instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); + trace_hardirqs_off_finish(); + instrumentation_end(); +} /** * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts -- cgit v1.2.3 From 221a164035fd8b554a44bd7c4bf8e7715a497561 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Dec 2023 08:45:20 +0100 Subject: entry: Move syscall_enter_from_user_mode() to header file To allow inlining of syscall_enter_from_user_mode(), move it to entry-common.h. Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20231218074520.1998026-4-svens@linux.ibm.com --- include/linux/entry-common.h | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c4205390448ee..b0fb775a600d9 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -134,6 +134,9 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) */ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work); + /** * syscall_enter_from_user_mode_work - Check and handle work before invoking * a syscall @@ -157,7 +160,15 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); * ptrace_report_syscall_entry(), __secure_computing(), trace_sys_enter() * 2) Invocation of audit_syscall_entry() */ -long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); +static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall) +{ + unsigned long work = 
READ_ONCE(current_thread_info()->syscall_work); + + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); + + return syscall; +} /** * syscall_enter_from_user_mode - Establish state and check and handle work @@ -176,7 +187,19 @@ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. */ -long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall); +static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) +{ + long ret; + + enter_from_user_mode(regs); + + instrumentation_begin(); + local_irq_enable(); + ret = syscall_enter_from_user_mode_work(regs, syscall); + instrumentation_end(); + + return ret; +} /** * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable() -- cgit v1.2.3 From bbcd80f53a5e8c27c2511f539fec8c373f500cf4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 15 Dec 2023 13:32:05 +0100 Subject: mtd: rawnand: Prevent crossing LUN boundaries during sequential reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ONFI specification states that devices do not need to support sequential reads across LUN boundaries. In order to prevent such event from happening and possibly failing, let's introduce the concept of "pause" in the sequential read to handle these cases. The first/last pages remain the same but any time we cross a LUN boundary we will end and restart (if relevant) the sequential read operation. 
Cc: stable@vger.kernel.org Fixes: 003fe4b9545b ("mtd: rawnand: Support for sequential cache reads") Signed-off-by: Miquel Raynal Tested-by: Martin Hundebøll Link: https://lore.kernel.org/linux-mtd/20231215123208.516590-2-miquel.raynal@bootlin.com --- include/linux/mtd/rawnand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index c29ace15a053a..9d0fc5109af66 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1265,6 +1265,7 @@ struct nand_secure_region { * @cont_read: Sequential page read internals * @cont_read.ongoing: Whether a continuous read is ongoing or not * @cont_read.first_page: Start of the continuous read operation + * @cont_read.pause_page: End of the current sequential cache read operation * @cont_read.last_page: End of the continuous read operation * @controller: The hardware controller structure which is shared among multiple * independent devices @@ -1321,6 +1322,7 @@ struct nand_chip { struct { bool ongoing; unsigned int first_page; + unsigned int pause_page; unsigned int last_page; } cont_read; -- cgit v1.2.3 From 8e6e83d77227d9ba39e0c7b50693f1b4f8728006 Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Fri, 22 Dec 2023 15:47:07 +0530 Subject: block: skip start/end time stamping for passthrough IO commit 41fa722239b4 ("blk-mq: do not include passthrough requests in I/O accounting")' disables I/O accounting for passthrough requests. Since tools like 'iostat' do not show anything useful for passthrough I/O, it's wasteful to do start/end time-stamping. So do away with that. 
Avoiding the time-stamping improves the I/O performance by ~7% Signed-off-by: Kundan Kumar Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20231222101707.6921-1-kundan.kumar@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1ab3081c82eda..a676e116085f3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -830,6 +830,12 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib); */ static inline bool blk_mq_need_time_stamp(struct request *rq) { + /* + * passthrough io doesn't use iostat accounting, cgroup stats + * and io scheduler functionalities. + */ + if (blk_rq_is_passthrough(rq)) + return false; return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } -- cgit v1.2.3 From 60e43fe5285e2077ce9904d78cd42a230d03b788 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:31 -0700 Subject: lib/firmware_table: tables: Add CDAT table parsing support The CDAT table is very similar to ACPI tables when it comes to sub-table and entry structures. The helper functions can be also used to parse the CDAT table. Add support to the helper functions to deal with an external CDAT table, and also handle the endianness since CDAT can be processed by a BE host. Export a function cdat_table_parse() for CXL driver to parse a CDAT table. In order to minimize ACPICA code changes, __force is being utilized to deal with the case of a big endian (BE) host parsing a CDAT. All CDAT data structure variables are being force casted to __leX as appropriate. Cc: Rafael J. Wysocki Cc: Len Brown Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J.
Wysocki Link: https://lore.kernel.org/r/170319615131.2212653.10932785667981494238.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- include/linux/fw_table.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fw_table.h b/include/linux/fw_table.h index ca49947f0a775..95421860397a2 100644 --- a/include/linux/fw_table.h +++ b/include/linux/fw_table.h @@ -25,16 +25,35 @@ struct acpi_subtable_proc { int count; }; +union fw_table_header { + struct acpi_table_header acpi; + struct acpi_table_cdat cdat; +}; + union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; struct acpi_prmt_module_header prmt; struct acpi_cedt_header cedt; + struct acpi_cdat_header cdat; }; int acpi_parse_entries_array(char *id, unsigned long table_size, - struct acpi_table_header *table_header, + union fw_table_header *table_header, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); +int cdat_table_parse(enum acpi_cdat_type type, + acpi_tbl_entry_handler_arg handler_arg, void *arg, + struct acpi_table_cdat *table_header); + +/* CXL is the only non-ACPI consumer of the FIRMWARE_TABLE library */ +#if IS_ENABLED(CONFIG_ACPI) && !IS_ENABLED(CONFIG_CXL_BUS) +#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_ACPI_LIB(x) +#define __init_or_fwtbl_lib __init_or_acpilib +#else +#define EXPORT_SYMBOL_FWTBL_LIB(x) EXPORT_SYMBOL_NS_GPL(x, CXL) +#define __init_or_fwtbl_lib +#endif + #endif -- cgit v1.2.3 From 6a954e94d038f41d79c4e04348c95774d1c9337d Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:02:37 -0700 Subject: base/node / acpi: Change 'node_hmem_attrs' to 'access_coordinates' Dan Williams suggested changing the struct 'node_hmem_attrs' to 'access_coordinates' [1]. The struct is a container of r/w-latency and r/w-bandwidth numbers. 
Moving forward, this container will also be used by CXL to store the performance characteristics of each link hop in the PCIE/CXL topology. So, where node_hmem_attrs is just the access parameters of a memory-node, access_coordinates applies more broadly to hardware topology characteristics. The observation is that seemed like an exercise in having the application identify "where" it falls on a spectrum of bandwidth and latency needs. For the tuple of read/write-latency and read/write-bandwidth, "coordinates" is not a perfect fit. Sometimes it is just conveying values in isolation and not a "location" relative to other performance points, but in the end this data is used to identify the performance operation point of a given memory-node. [2] Link: http://lore.kernel.org/r/64471313421f7_1b66294d5@dwillia2-xfh.jf.intel.com.notmuch/ Link: https://lore.kernel.org/linux-cxl/645e6215ee0de_1e6f2945e@dwillia2-xfh.jf.intel.com.notmuch/ Suggested-by: Dan Williams Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/170319615734.2212653.15319394025985499185.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- include/linux/memory-tiers.h | 10 +++++----- include/linux/node.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 1e39d27bee418..69e7819000827 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -33,7 +33,7 @@ struct memory_dev_type { struct kref kref; }; -struct node_hmem_attrs; +struct access_coordinate; #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; @@ -45,9 +45,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype); int register_mt_adistance_algorithm(struct notifier_block *nb); int unregister_mt_adistance_algorithm(struct notifier_block *nb); int mt_calc_adistance(int node, int *adist); -int 
mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source); -int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist); +int mt_perf_to_adistance(struct access_coordinate *perf, int *adist); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); @@ -126,13 +126,13 @@ static inline int mt_calc_adistance(int node, int *adist) return NOTIFY_DONE; } -static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, +static inline int mt_set_default_dram_perf(int nid, struct access_coordinate *perf, const char *source) { return -EIO; } -static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +static inline int mt_perf_to_adistance(struct access_coordinate *perf, int *adist) { return -EIO; } diff --git a/include/linux/node.h b/include/linux/node.h index 427a5975cf405..25b66d705ee2e 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -20,14 +20,14 @@ #include /** - * struct node_hmem_attrs - heterogeneous memory performance attributes + * struct access_coordinate - generic performance coordinates container * * @read_bandwidth: Read bandwidth in MB/s * @write_bandwidth: Write bandwidth in MB/s * @read_latency: Read latency in nanoseconds * @write_latency: Write latency in nanoseconds */ -struct node_hmem_attrs { +struct access_coordinate { unsigned int read_bandwidth; unsigned int write_bandwidth; unsigned int read_latency; @@ -65,7 +65,7 @@ struct node_cache_attrs { #ifdef CONFIG_HMEM_REPORTING void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); -void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs, +void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, unsigned access); #else static inline void node_add_cache(unsigned int nid, @@ -74,7 +74,7 @@ static inline void 
node_add_cache(unsigned int nid, } static inline void node_set_perf_attrs(unsigned int nid, - struct node_hmem_attrs *hmem_attrs, + struct access_coordinate *coord, unsigned access) { } -- cgit v1.2.3 From ca53543d8e340070fb37fde93f36ed9012c76b90 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:07 -0700 Subject: acpi: numa: Add helper function to retrieve the performance attributes Add helper to retrieve the performance attributes based on the device handle. The helper function is exported so the CXL driver can use that to acquire the performance data between the CPU and the CXL host bridge. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/170319618721.2212653.5552947472849081786.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- include/linux/acpi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4db54e928b36d..8b0761c682f99 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -15,6 +15,7 @@ #include #include #include +#include struct irq_domain; struct irq_domain_ops; @@ -424,6 +425,16 @@ extern int acpi_blacklisted(void); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); +#ifdef CONFIG_ACPI_HMAT +int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord); +#else +static inline int acpi_get_genport_coordinates(u32 uid, + struct access_coordinate *coord) +{ + return -EOPNOTSUPP; +} +#endif + #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); -- cgit v1.2.3 From 4d07a05397c8c15c37c8c3abb7afaea1dcd2f0e7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 21 Dec 2023 15:03:39 -0700 Subject: cxl: Calculate and store PCI link latency for the downstream ports The latency is calculated by dividing the flit size over the bandwidth. 
Add support to retrieve the flit size for the CXL switch device and calculate the latency of the PCIe link. Cache the latency number with cxl_dport. Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/170319621931.2212653.6800240203604822886.stgit@djiang5-mobl3 Signed-off-by: Dan Williams --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index dea043bc1e383..504a4ba2c29ef 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1364,6 +1364,7 @@ int pcie_set_mps(struct pci_dev *dev, int mps); u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev, enum pci_bus_speed *speed, enum pcie_link_width *width); +int pcie_link_speed_mbps(struct pci_dev *pdev); void pcie_print_link_status(struct pci_dev *dev); int pcie_reset_flr(struct pci_dev *dev, bool probe); int pcie_flr(struct pci_dev *dev); -- cgit v1.2.3 From dcc3e46472d678f4af5ce1194a23649231c5d241 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 18 Dec 2023 17:26:26 -0700 Subject: net: skbuff: Remove some excess struct-member documentation Remove documentation for nonexistent structure members, addressing these warnings: ./include/linux/skbuff.h:1063: warning: Excess struct member 'sp' description in 'sk_buff' ./include/linux/skbuff.h:1063: warning: Excess struct member 'nf_bridge' description in 'sk_buff' Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ea5c8ab3ed00d..50e92c8471dc7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -754,7 +754,6 @@ typedef unsigned char *sk_buff_data_t; * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. 
Put private vars here * @_skb_refdst: destination entry (with norefcount bit) - * @sp: the security path, used for xfrm * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header @@ -788,7 +787,6 @@ typedef unsigned char *sk_buff_data_t; * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) - * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash -- cgit v1.2.3 From 520adf3ba4a4bdd41450c57b17ef01f8a069fbfe Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:05:22 -0800 Subject: driver core: class: fix Excess kernel-doc description warning Remove the @p: lines to prevent the kernel-doc warning: include/linux/device/class.h:72: warning: Excess struct member 'p' description in 'class' Signed-off-by: Randy Dunlap Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20231223050522.13867-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index abf3d3bfb6fe4..c576b49c55c22 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -40,8 +40,6 @@ struct fwnode_handle; * for the devices belonging to the class. Usually tied to * device's namespace. * @pm: The default device power management operations of this class. - * @p: The private data of the driver core, no one other than the - * driver core can touch this. * * A class is a higher-level view of a device that abstracts out low-level * implementation details. 
Drivers may see a SCSI disk or an ATA disk, but, -- cgit v1.2.3 From ae4d90f7ca49eb71f8a3dca64d06d4c4e2193705 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:05:32 -0800 Subject: driver core: device.h: fix Excess kernel-doc description warning Remove the @knode_class: line to prevent the kernel-doc warning: include/linux/device.h:807: warning: Excess struct member 'knode_class' description in 'device' Signed-off-by: Randy Dunlap Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20231223050532.13881-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index aefc5ca7f1cfc..ed600dbf950e3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -662,7 +662,6 @@ struct device_physical_location { * @id: device instance * @devres_lock: Spinlock to protect the resource of the device. * @devres_head: The resources list of the device. - * @knode_class: The node used to add the device to the class list. * @class: The class of the device. * @groups: Optional attribute groups. 
* @release: Callback to free the device after all references have -- cgit v1.2.3 From 1760bfa7d7ca490cf8a61fe50ddeb1769cadd89e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:06:36 -0800 Subject: usb: linux/usb.h: fix Excess kernel-doc description warning Remove the @removable: line to prevent the kernel-doc warning: include/linux/usb.h:732: warning: Excess struct member 'removable' description in 'usb_device' Signed-off-by: Randy Dunlap Cc: Greg Kroah-Hartman Cc: linux-usb@vger.kernel.org Link: https://lore.kernel.org/r/20231223050636.14022-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 618e5a0b1a223..07556341ba2b4 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -632,7 +632,6 @@ struct usb3_lpm_parameters { * @reset_resume: needs reset instead of resume * @port_is_suspended: the upstream port is suspended (L2 or U3) * @slot_id: Slot ID assigned by xHCI - * @removable: Device can be physically removed from this port * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout. * @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout. * @u2_params: exit latencies for USB3 U2 LPM state, and hub-initiated timeout. -- cgit v1.2.3 From f6847807c22f6944c71c981b630b9fff30801e73 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 22 Nov 2023 23:18:11 +0100 Subject: linux/export: Fix alignment for 64-bit ksymtab entries An alignment of 4 bytes is wrong for 64-bit platforms which don't define CONFIG_HAVE_ARCH_PREL32_RELOCATIONS (which then store 64-bit pointers). Fix their alignment to 8 bytes. 
Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost") Signed-off-by: Helge Deller Signed-off-by: Masahiro Yamada --- include/linux/export-internal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 69501e0ec239f..cd253eb51d6c0 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -16,10 +16,13 @@ * and eliminates the need for absolute relocations that require runtime * processing on relocatable kernels. */ +#define __KSYM_ALIGN ".balign 4" #define __KSYM_REF(sym) ".long " #sym "- ." #elif defined(CONFIG_64BIT) +#define __KSYM_ALIGN ".balign 8" #define __KSYM_REF(sym) ".quad " #sym #else +#define __KSYM_ALIGN ".balign 4" #define __KSYM_REF(sym) ".long " #sym #endif @@ -42,7 +45,7 @@ " .asciz \"" ns "\"" "\n" \ " .previous" "\n" \ " .section \"___ksymtab" sec "+" #name "\", \"a\"" "\n" \ - " .balign 4" "\n" \ + __KSYM_ALIGN "\n" \ "__ksymtab_" #name ":" "\n" \ __KSYM_REF(sym) "\n" \ __KSYM_REF(__kstrtab_ ##name) "\n" \ -- cgit v1.2.3 From f91a704f7161c2cf0fcd41fa9fbec4355b813fff Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 2 Oct 2023 17:19:46 +0300 Subject: fs: prepare for stackable filesystems backing file helpers In preparation for factoring out some backing file io helpers from overlayfs, move backing_file_open() into a new file fs/backing-file.c and header. Add a MAINTAINERS entry for stackable filesystems and add a Kconfig FS_STACK which stackable filesystems need to select. For now, the backing_file struct, the backing_file alloc/free functions and the backing_file_real_path() accessor remain internal to file_table.c. We may change that in the future. 
Signed-off-by: Amir Goldstein --- include/linux/backing-file.h | 17 +++++++++++++++++ include/linux/fs.h | 3 --- 2 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 include/linux/backing-file.h (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h new file mode 100644 index 0000000000000..55c9e804f7804 --- /dev/null +++ b/include/linux/backing-file.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common helpers for stackable filesystems and backing files. + * + * Copyright (C) 2023 CTERA Networks. + */ + +#ifndef _LINUX_BACKING_FILE_H +#define _LINUX_BACKING_FILE_H + +#include + +struct file *backing_file_open(const struct path *user_path, int flags, + const struct path *real_path, + const struct cred *cred); + +#endif /* _LINUX_BACKING_FILE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 900d0cd55b50f..db5d07e6e02ee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2575,9 +2575,6 @@ struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct file *backing_file_open(const struct path *user_path, int flags, - const struct path *real_path, - const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* -- cgit v1.2.3 From a6293b3e285cd0d7692141d7981a5f144f0e2f0b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 17:48:52 +0200 Subject: fs: factor out backing_file_{read,write}_iter() helpers Overlayfs submits files io to backing files on other filesystems. Factor out some common helpers to perform io to backing files, into fs/backing-file.c. 
Suggested-by: Miklos Szeredi Link: https://lore.kernel.org/r/CAJfpeguhmZbjP3JLqtUy0AdWaHOkAPWeP827BBWwRFEAUgnUcQ@mail.gmail.com Signed-off-by: Amir Goldstein --- include/linux/backing-file.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 55c9e804f7804..0648d548a4180 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -9,9 +9,24 @@ #define _LINUX_BACKING_FILE_H #include +#include +#include + +struct backing_file_ctx { + const struct cred *cred; + struct file *user_file; + void (*accessed)(struct file *); + void (*end_write)(struct file *); +}; struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred); +ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, + struct kiocb *iocb, int flags, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ -- cgit v1.2.3 From 9b7e9e2f5d5c3d079ec46bc71b114012e362ea6e Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 12:13:12 +0300 Subject: fs: factor out backing_file_splice_{read,write}() helpers There is not much in those helpers, but it makes sense to have them logically next to the backing_file_{read,write}_iter() helpers as they may grow more common logic in the future. 
Signed-off-by: Amir Goldstein --- include/linux/backing-file.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 0648d548a4180..0546d5b1c9f59 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -28,5 +28,13 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx); +ssize_t backing_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); +ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, size_t len, + unsigned int flags, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ -- cgit v1.2.3 From f567377e406c032fff0799bde4fdf4a977529b84 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 12:49:37 +0300 Subject: fs: factor out backing_file_mmap() helper Assert that the file object is allocated in a backing_file container so that file_user_path() could be used to display the user path and not the backing file's path in /proc/<pid>/maps.
Signed-off-by: Amir Goldstein --- include/linux/backing-file.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h index 0546d5b1c9f59..3f1fe1774f1b6 100644 --- a/include/linux/backing-file.h +++ b/include/linux/backing-file.h @@ -36,5 +36,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, struct backing_file_ctx *ctx); +int backing_file_mmap(struct file *file, struct vm_area_struct *vma, + struct backing_file_ctx *ctx); #endif /* _LINUX_BACKING_FILE_H */ -- cgit v1.2.3 From 9942cb22ea458c34fa17b73d143ea32d4df1caca Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:49 +0100 Subject: sched/topology: Add a new arch_scale_freq_ref() method Create a new method to get a unique and fixed max frequency. Currently cpuinfo.max_freq or the highest (or last) state of performance domain are used as the max frequency when computing the frequency for a level of utilization, but: - cpuinfo_max_freq can change at runtime. boost is one example of such change. - cpuinfo.max_freq and last item of the PD can be different leading to different results between cpufreq and energy model. We need to save the reference frequency that has been used when computing the CPUs capacity and use this fixed and coherent value to convert between frequency and CPU's capacity. In fact, we already save the frequency that has been used when computing the capacity of each CPU. We extend the precision to save kHz instead of MHz currently and we modify the type to be aligned with other variables used when converting frequency to capacity and the other way. [ mingo: Minor edits. 
] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20231211104855.558096-2-vincent.guittot@linaro.org --- include/linux/arch_topology.h | 7 +++++++ include/linux/sched/topology.h | 8 ++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a07b510e7dc55..32c24ff4f2a80 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); +DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index de545ba852189..a6e04b4a21d70 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -279,6 +279,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); -- cgit v1.2.3 From 599457ba15403037b489fe536266a3d5f9efaed7 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:50 +0100 Subject: cpufreq: Use the fixed and coherent frequency for scaling capacity cpuinfo.max_freq can change at runtime because of boost as an example. This implies that the value could be different from the frequency that has been used to compute the capacity of a CPU. 
The new arch_scale_freq_ref() returns a fixed and coherent frequency that can be used to compute the capacity for a given frequency. [ Also fix an arch_set_freq_scale() newline style wart in <linux/cpufreq.h>. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231211104855.558096-3-vincent.guittot@linaro.org --- include/linux/cpufreq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1c5ca92a0555f..afda5f24d3ddc 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1203,6 +1203,7 @@ void arch_set_freq_scale(const struct cpumask *cpus, { } #endif + /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -- cgit v1.2.3 From 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:52 +0100 Subject: energy_model: Use a fixed reference frequency The last item of a performance domain is not always the performance point that has been used to compute CPU's capacity. This can lead to different target frequency compared with other part of the system like schedutil and would result in wrong energy estimation. A new arch_scale_freq_ref() is available to return a fixed and coherent frequency reference that can be used when computing the CPU's frequency for a level of utilization. Use this function to get this reference frequency. Energy model is never used without defining arch_scale_freq_ref() but can be compiled. Define a default arch_scale_freq_ref() returning 0 in such case.
Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20231211104855.558096-5-vincent.guittot@linaro.org --- include/linux/energy_model.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index adec808b371a1..88d91e0874718 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -224,7 +224,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util, unsigned long allowed_cpu_cap) { - unsigned long freq, scale_cpu; + unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; int cpu; @@ -241,10 +241,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ps = &pd->table[pd->nr_perf_states - 1]; + ref_freq = arch_scale_freq_ref(cpu); max_util = min(max_util, allowed_cpu_cap); - freq = map_util_freq(max_util, ps->frequency, scale_cpu); + freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the -- cgit v1.2.3 From 1f023007f5e782bda19ad9104830c404fd622c5d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:55 +0100 Subject: arm64/amu: Use capacity_ref_freq() to set AMU ratio Use the new capacity_ref_freq() method to set the ratio that is used by AMU for computing the arch_scale_freq_capacity(). This helps to keep everything aligned using the same reference for computing CPUs capacity. The default value of the ratio (stored in per_cpu(arch_max_freq_scale)) ensures that arch_scale_freq_capacity() returns max capacity until it is set to its correct value with the cpu capacity and capacity_ref_freq(). 
Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Sudeep Holla Acked-by: Will Deacon Link: https://lore.kernel.org/r/20231211104855.558096-8-vincent.guittot@linaro.org --- include/linux/arch_topology.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 32c24ff4f2a80..a63d61ca55afc 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -99,6 +99,7 @@ void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); +void freq_inv_set_max_ratio(int cpu, u64 max_rate); #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ -- cgit v1.2.3 From 11137d384996bb05cf33c8163db271e1bac3f4bf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 1 Dec 2023 17:16:52 +0100 Subject: sched/fair: Simplify util_est With UTIL_EST_FASTUP now being permanent, we can take advantage of the fact that the ewma jumps directly to a higher utilization at dequeue to simplify util_est and remove the enqueued field. 
Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Reviewed-by: Hongyan Xia Reviewed-by: Alex Shi Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org --- include/linux/sched.h | 49 ++++++++++++------------------------------------- 1 file changed, 12 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8d258162deb0a..03bfe9ab29511 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -415,42 +415,6 @@ struct load_weight { u32 inv_weight; }; -/** - * struct util_est - Estimation utilization of FAIR tasks - * @enqueued: instantaneous estimated utilization of a task/cpu - * @ewma: the Exponential Weighted Moving Average (EWMA) - * utilization of a task - * - * Support data structure to track an Exponential Weighted Moving Average - * (EWMA) of a FAIR task's utilization. New samples are added to the moving - * average each time a task completes an activation. Sample's weight is chosen - * so that the EWMA will be relatively insensitive to transient changes to the - * task's workload. - * - * The enqueued attribute has a slightly different meaning for tasks and cpus: - * - task: the task's util_avg at last task dequeue time - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU - * Thus, the util_est.enqueued of a task represents the contribution on the - * estimated utilization of the CPU where that task is currently enqueued. - * - * Only for tasks we track a moving average of the past instantaneous - * estimated utilization. This allows to absorb sporadic drops in utilization - * of an otherwise almost periodic task. - * - * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg - * updates. When a task is dequeued, its util_est should not be updated if its - * util_avg has not been updated in the meantime. 
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue - * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg - * for a task) it is safe to use MSB. - */ -struct util_est { - unsigned int enqueued; - unsigned int ewma; -#define UTIL_EST_WEIGHT_SHIFT 2 -#define UTIL_AVG_UNCHANGED 0x80000000 -} __attribute__((__aligned__(sizeof(u64)))); - /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). @@ -505,9 +469,20 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; - struct util_est util_est; + unsigned int util_est; } ____cacheline_aligned; +/* + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. + */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; -- cgit v1.2.3 From 4498a8eccc97de3d65f876b6fdeddb439ef73abc Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Nov 2023 17:09:47 +0000 Subject: netfs, fscache: Remove ->begin_cache_operation Remove ->begin_cache_operation() in favour of just calling fscache directly. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com --- include/linux/fscache.h | 3 --- include/linux/netfs.h | 4 +--- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 8e312c8323a8e..9ed6696aee7ab 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -437,9 +437,6 @@ const struct netfs_cache_ops *fscache_operation_valid(const struct netfs_cache_r * indicates the cache resources to which the operation state should be * attached; @cookie indicates the cache object that will be accessed. * - * This is intended to be called from the ->begin_cache_operation() netfs lib - * operation as implemented by the network filesystem. - * * @cres->inval_counter is set from @cookie->inval_counter for comparison at * the end of the operation. This allows invalidation during the operation to * be detected by the caller. diff --git a/include/linux/netfs.h b/include/linux/netfs.h index b11a84f6c32b7..d294ff8f9ae45 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -208,7 +208,6 @@ struct netfs_io_request { struct netfs_request_ops { int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); - int (*begin_cache_operation)(struct netfs_io_request *rreq); void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); @@ -229,8 +228,7 @@ enum netfs_read_from_hole { }; /* - * Table of operations for access to a cache. This is obtained by - * rreq->ops->begin_cache_operation(). + * Table of operations for access to a cache. 
*/ struct netfs_cache_ops { /* End an operation */ -- cgit v1.2.3 From 7eb5b3e3a0a55f2d166ca949ef47ca6e0c704aab Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 21 Nov 2023 15:43:52 +0000 Subject: netfs, fscache: Move /proc/fs/fscache to /proc/fs/netfs and put in a symlink Rename /proc/fs/fscache to "netfs" and make a symlink from fscache to that. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Christian Brauner cc: linux-fsdevel@vger.kernel.org cc: linux-cachefs@redhat.com --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d294ff8f9ae45..9bd91cd615d51 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -294,7 +294,6 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, enum netfs_sreq_ref_trace what); -void netfs_stats_show(struct seq_file *); ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); -- cgit v1.2.3 From c9c4ff12df110feb1b91951010f673f4b16e49e8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 27 Nov 2023 13:58:07 +0000 Subject: netfs: Move pinning-for-writeback from fscache to netfs Move the resource pinning-for-writeback from fscache code to netfslib code. This is used to keep a cache backing object pinned whilst we have dirty pages on the netfs inode in the pagecache such that VM writeback will be able to reach it. Whilst we're at it, switch the parameters of netfs_unpin_writeback() to match ->write_inode() so that it can be used for that directly. Note that this mechanism could be more generically useful than that for network filesystems. Quite often they have to keep around other resources (e.g. authentication tokens or network connections) until the writeback is complete. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/fs.h | 2 +- include/linux/fscache.h | 42 ------------------------------------------ include/linux/netfs.h | 3 +++ include/linux/writeback.h | 2 +- 4 files changed, 5 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e3..68a9572616947 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2294,7 +2294,7 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_CREATING (1 << 15) #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) -#define I_PINNING_FSCACHE_WB (1 << 18) +#define I_PINNING_NETFS_WB (1 << 18) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 9ed6696aee7ab..6e8562cbcc432 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -626,48 +626,6 @@ static inline void fscache_write_to_cache(struct fscache_cookie *cookie, } -#if __fscache_available -bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio, - struct fscache_cookie *cookie); -#else -#define fscache_dirty_folio(MAPPING, FOLIO, COOKIE) \ - filemap_dirty_folio(MAPPING, FOLIO) -#endif - -/** - * fscache_unpin_writeback - Unpin writeback resources - * @wbc: The writeback control - * @cookie: The cookie referring to the cache object - * - * Unpin the writeback resources pinned by fscache_dirty_folio(). This is - * intended to be called by the netfs's ->write_inode() method. 
- */ -static inline void fscache_unpin_writeback(struct writeback_control *wbc, - struct fscache_cookie *cookie) -{ - if (wbc->unpinned_fscache_wb) - fscache_unuse_cookie(cookie, NULL, NULL); -} - -/** - * fscache_clear_inode_writeback - Clear writeback resources pinned by an inode - * @cookie: The cookie referring to the cache object - * @inode: The inode to clean up - * @aux: Auxiliary data to apply to the inode - * - * Clear any writeback resources held by an inode when the inode is evicted. - * This must be called before clear_inode() is called. - */ -static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, - struct inode *inode, - const void *aux) -{ - if (inode->i_state & I_PINNING_FSCACHE_WB) { - loff_t i_size = i_size_read(inode); - fscache_unuse_cookie(cookie, aux, &i_size); - } -} - /** * fscache_note_page_release - Note that a netfs page got released * @cookie: The cookie corresponding to the file diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 9bd91cd615d51..32faf6c897029 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -288,6 +288,9 @@ int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, struct address_space *, loff_t pos, unsigned int len, struct folio **, void **fsdata); +bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); +int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); +void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 083387c00f0c8..1e08392fb43e1 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -60,7 +60,7 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* 
range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ - unsigned unpinned_fscache_wb:1; /* Cleared I_PINNING_FSCACHE_WB */ + unsigned unpinned_netfs_wb:1; /* Cleared I_PINNING_NETFS_WB */ /* * When writeback IOs are bounced through async layers, only the -- cgit v1.2.3 From 87b57a048964abfd5f3d8b79bc55687327f5a380 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Mar 2022 10:34:27 +0000 Subject: netfs: Add a procfile to list in-progress requests Add a procfile, /proc/fs/netfs/requests, to list in-progress netfslib I/O requests. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 32faf6c897029..7244ddebd974c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -175,10 +175,14 @@ enum netfs_io_origin { * operations to a variety of data stores and then stitch the result together. */ struct netfs_io_request { - struct work_struct work; + union { + struct work_struct work; + struct rcu_head rcu; + }; struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ struct netfs_cache_resources cache_resources; + struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ void *netfs_priv; /* Private data for the netfs */ unsigned int debug_id; -- cgit v1.2.3 From cc3cb0a18da46a51d9fc173155576ba1d068e536 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Mar 2022 11:01:12 +0000 Subject: netfs: Allow the netfs to make the io (sub)request alloc larger Allow the network filesystem to specify extra space to be allocated on the end of the io (sub)request. This allows cifs, for example, to use this space rather than allocating its own cifs_readdata struct. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 7244ddebd974c..d6f27000eeb07 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -210,6 +210,8 @@ struct netfs_io_request { * Operations the network filesystem can/must provide to the helpers. */ struct netfs_request_ops { + unsigned int io_request_size; /* Alloc size for netfs_io_request struct */ + unsigned int io_subrequest_size; /* Alloc size for netfs_io_subrequest struct */ int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); -- cgit v1.2.3 From 5f5ce7ba15e7e6a6539ac8e1f845757aaebecf0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Feb 2022 11:19:14 +0000 Subject: netfs: Add a ->free_subrequest() op Add a ->free_subrequest() op so that the netfs can clean up data attached to a subrequest. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d6f27000eeb07..06f57d9d09f6c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -214,6 +214,7 @@ struct netfs_request_ops { unsigned int io_subrequest_size; /* Alloc size for netfs_io_subrequest struct */ int (*init_request)(struct netfs_io_request *rreq, struct file *file); void (*free_request)(struct netfs_io_request *rreq); + void (*free_subrequest)(struct netfs_io_subrequest *rreq); void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); -- cgit v1.2.3 From c1ec4d7c2e13471558cfea302b7583856284f94c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 20 Aug 2021 17:08:30 +0100 Subject: netfs: Provide invalidate_folio and release_folio calls Provide default invalidate_folio and release_folio calls. These will need to interact with invalidation correctly at some point. They will be needed if netfslib is to make use of folio->private for its own purposes. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 06f57d9d09f6c..8efbfd3b28202 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -293,11 +293,13 @@ struct readahead_control; void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, - struct address_space *, loff_t pos, unsigned int len, - struct folio **, void **fsdata); + struct address_space *, loff_t pos, unsigned int len, + struct folio **, void **fsdata); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); +void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); +bool netfs_release_folio(struct folio *folio, gfp_t gfp); void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, -- cgit v1.2.3 From 46ed60dcd4f2c94d27735743ce55cd8d6b93cc1d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Oct 2023 15:34:07 +0100 Subject: netfs: Implement unbuffered/DIO vs buffered I/O locking Borrow NFS's direct-vs-buffered I/O locking into netfslib. Similar code is also used in ceph. Modify it to have the correct checker annotations for i_rwsem lock acquisition/release and to return -ERESTARTSYS if waits are interrupted. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 8efbfd3b28202..fc6d9756a0294 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -129,6 +129,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif loff_t remote_i_size; /* Size of the remote file */ + unsigned long flags; +#define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ }; /* @@ -310,6 +312,13 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); +int netfs_start_io_read(struct inode *inode); +void netfs_end_io_read(struct inode *inode); +int netfs_start_io_write(struct inode *inode); +void netfs_end_io_write(struct inode *inode); +int netfs_start_io_direct(struct inode *inode); +void netfs_end_io_direct(struct inode *inode); + /** * netfs_inode - Get the netfs inode context from the inode * @inode: The inode to query @@ -335,6 +344,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, { ctx->ops = ops; ctx->remote_i_size = i_size_read(&ctx->inode); + ctx->flags = 0; #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif -- cgit v1.2.3 From 92b6cc5d1e7cbe569f00e9c1249ac8214fd5e2d2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 26 Sep 2023 17:42:26 +0100 Subject: netfs: Add iov_iters to (sub)requests to describe various buffers Add three iov_iter structs: (1) Add an iov_iter (->iter) to the I/O request to describe the unencrypted-side buffer. (2) Add an iov_iter (->io_iter) to the I/O request to describe the encrypted-side I/O buffer. This may be a different size to the buffer in (1). (3) Add an iov_iter (->io_iter) to the I/O subrequest to describe the part of the I/O buffer for that subrequest. 
This will allow future patches to point to a bounce buffer instead for purposes of handling oversize writes, decryption (where we want to save the encrypted data to the cache) and decompression. These iov_iters persist for the lifetime of the (sub)request, and so can be accessed multiple times without worrying about them being deallocated upon return to the caller. The network filesystem must appropriately advance the iterator before terminating the request. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index fc6d9756a0294..3da962e977f55 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -150,6 +150,7 @@ struct netfs_cache_resources { struct netfs_io_subrequest { struct netfs_io_request *rreq; /* Supervising I/O request */ struct list_head rreq_link; /* Link in rreq->subrequests */ + struct iov_iter io_iter; /* Iterator for this subrequest */ loff_t start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ @@ -186,6 +187,8 @@ struct netfs_io_request { struct netfs_cache_resources cache_resources; struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ + struct iov_iter iter; /* Unencrypted-side iterator */ + struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ unsigned int debug_id; atomic_t nr_outstanding; /* Number of ops in progress */ -- cgit v1.2.3 From f1bb47a31dff6d4b34fb14e99850860ee74bb003 Mon Sep 17 00:00:00 2001 From: Alfred Piccioni Date: Tue, 19 Dec 2023 10:09:09 +0100 Subject: lsm: new security_file_ioctl_compat() hook Some ioctl commands do not require ioctl permission, but are routed to other permissions such as 
FILE_GETATTR or FILE_SETATTR. This routing is done by comparing the ioctl cmd to a set of 64-bit flags (FS_IOC_*). However, if a 32-bit process is running on a 64-bit kernel, it emits 32-bit flags (FS_IOC32_*) for certain ioctl operations. These flags are being checked erroneously, which leads to these ioctl operations being routed to the ioctl permission, rather than the correct file permissions. This was also noted in a RED-PEN finding from a while back - "/* RED-PEN how should LSM module know it's handling 32bit? */". This patch introduces a new hook, security_file_ioctl_compat(), that is called from the compat ioctl syscall. All current LSMs have been changed to support this hook. Reviewing the three places where we are currently using security_file_ioctl(), it appears that only SELinux needs a dedicated compat change; TOMOYO and SMACK appear to be functional without any change. Cc: stable@vger.kernel.org Fixes: 0b24dcb7f2f7 ("Revert "selinux: simplify ioctl checking"") Signed-off-by: Alfred Piccioni Reviewed-by: Stephen Smalley [PM: subject tweak, line length fixes, and alignment corrections] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index c925a0d26edfe..185924c563787 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -171,6 +171,8 @@ LSM_HOOK(int, 0, file_alloc_security, struct file *file) LSM_HOOK(void, LSM_RET_VOID, file_free_security, struct file *file) LSM_HOOK(int, 0, file_ioctl, struct file *file, unsigned int cmd, unsigned long arg) +LSM_HOOK(int, 0, file_ioctl_compat, struct file *file, unsigned int cmd, + unsigned long arg) LSM_HOOK(int, 0, mmap_addr, unsigned long addr) LSM_HOOK(int, 0, mmap_file, struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) diff --git a/include/linux/security.h 
b/include/linux/security.h index 750130a7b9dd2..d0eb20f90b264 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -394,6 +394,8 @@ int security_file_permission(struct file *file, int mask); int security_file_alloc(struct file *file); void security_file_free(struct file *file); int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int security_file_ioctl_compat(struct file *file, unsigned int cmd, + unsigned long arg); int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags); int security_mmap_addr(unsigned long addr); @@ -1002,6 +1004,13 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd, return 0; } +static inline int security_file_ioctl_compat(struct file *file, + unsigned int cmd, + unsigned long arg) +{ + return 0; +} + static inline int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { -- cgit v1.2.3 From 8645e659e2d227f6ce8fcea1ac640c324fbbb3e6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:05:56 -0800 Subject: iio: linux/iio.h: fix Excess kernel-doc description warning Remove the @of_xlate: lines to prevent the kernel-doc warning: include/linux/iio/iio.h:534: warning: Excess struct member 'of_xlate' description in 'iio_info' Signed-off-by: Randy Dunlap Cc: Jonathan Cameron Cc: Lars-Peter Clausen Cc: linux-iio@vger.kernel.org Link: https://lore.kernel.org/r/20231223050556.13948-1-rdunlap@infradead.org Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d0ce3b71106aa..c5b36d2c1e735 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -434,13 +434,7 @@ struct iio_trigger; /* forward declaration */ * @update_scan_mode: function to configure device and scan buffer when * channels have changed * @debugfs_reg_access: function to read or write register 
value of device - * @of_xlate: function pointer to obtain channel specifier index. - * When #iio-cells is greater than '0', the driver could - * provide a custom of_xlate function that reads the - * *args* and returns the appropriate index in registered - * IIO channels array. * @fwnode_xlate: fwnode based function pointer to obtain channel specifier index. - * Functionally the same as @of_xlate. * @hwfifo_set_watermark: function pointer to set the current hardware * fifo watermark level; see hwfifo_* entries in * Documentation/ABI/testing/sysfs-bus-iio for details on -- cgit v1.2.3 From 02d374f3418df577c850f0cd45c3da9245ead547 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:15:24 +0000 Subject: block: renumber QUEUE_FLAG_HW_WC For the QUEUE_FLAG_HW_WC to actually work, it needs to have a separate number from QUEUE_FLAG_FUA, doh. Fixes: 43c9835b144c ("block: don't allow enabling a cache on devices that don't support it") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231226081524.180289-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 51fa7ffdee83b..88e9dd4b71fba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -538,7 +538,7 @@ struct request_queue { #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ -#define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */ +#define QUEUE_FLAG_HW_WC 13 /* Write back caching supported */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ -- cgit v1.2.3 From 1c042f8d4bc342b7985b1de3d76836f1a1083b65 Mon Sep 
17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Dec 2023 08:05:38 +0100 Subject: block: reject invalid operation in submit_bio_noacct submit_bio_noacct allows completely invalid operations, or operations that are not supported in the bio path. Extend the existing switch statement to reject all invalid types. Move the code point for REQ_OP_ZONE_APPEND so that it's not right in the middle of the zone management operations and the switch statement can follow the numerical order of the operations. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231221070538.1112446-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d5c5e59ddbd25..68c9eb2374a46 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -378,6 +378,8 @@ enum req_op { REQ_OP_DISCARD = (__force blk_opf_t)3, /* securely erase sectors */ REQ_OP_SECURE_ERASE = (__force blk_opf_t)5, + /* write data at the current zone write pointer */ + REQ_OP_ZONE_APPEND = (__force blk_opf_t)7, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9, /* Open a zone */ @@ -386,12 +388,10 @@ enum req_op { REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11, /* Transition a zone to full */ REQ_OP_ZONE_FINISH = (__force blk_opf_t)12, - /* write data at the current zone write pointer */ - REQ_OP_ZONE_APPEND = (__force blk_opf_t)13, /* reset a zone write pointer */ - REQ_OP_ZONE_RESET = (__force blk_opf_t)15, + REQ_OP_ZONE_RESET = (__force blk_opf_t)13, /* reset all the zone present on the device */ - REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17, + REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15, /* Driver private requests */ REQ_OP_DRV_IN = (__force blk_opf_t)34, -- cgit v1.2.3 From b3bf76024f645369e1fc45e0b08a2bd24f200d9b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 19 Dec 2023
22:26:16 +0800 Subject: net/smc: manage system EID in SMC stack instead of ISM driver The System EID (SEID) is an internal EID that is used by the SMCv2 software stack that has a predefined and constant value representing the s390 physical machine that the OS is executing on. So it should be managed by SMC stack instead of ISM driver and be consistent for all ISMv2 devices (including virtual ISM devices) on s390 architecture. Suggested-by: Alexandra Winter Signed-off-by: Wen Gu Reviewed-and-tested-by: Wenjia Zhang Reviewed-by: Alexandra Winter Signed-off-by: David S. Miller --- include/linux/ism.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a4c204df3da1..5428edd909823 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -86,7 +86,6 @@ int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); -u8 *ism_get_seid(void); const struct smcd_ops *ism_get_smcd_ops(void); -- cgit v1.2.3 From 0942155a48e4cfc2c83e514c86a3de8f78f6af02 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 20 Dec 2023 14:35:05 +0100 Subject: PCI: Remove unused 'node' member from struct pci_driver Remove the unused 'node' member. It got replaced by device_driver chaining more than 20 years ago in commit 4b4a837f2b57 ("PCI: start to use common fields of struct device_driver more...") of the history.git tree.
Link: https://lore.kernel.org/r/20231220133505.8798-1-minipli@grsecurity.net Signed-off-by: Mathias Krause Signed-off-by: Bjorn Helgaas Acked-by: Kalle Valo --- include/linux/pci.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 60ca768bc8679..1a89dc66f89ac 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -885,7 +885,6 @@ struct module; /** * struct pci_driver - PCI driver structure - * @node: List of driver structures. * @name: Driver name. * @id_table: Pointer to table of device IDs the driver is * interested in. Most drivers should export this @@ -940,7 +939,6 @@ struct module; * own I/O address space. */ struct pci_driver { - struct list_head node; const char *name; const struct pci_device_id *id_table; /* Must be non-NULL for probe to be called */ int (*probe)(struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted */ -- cgit v1.2.3 From ba367479c7ad0b870461024cd5ae7a1ea6e1e3db Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 19 Dec 2023 11:32:39 +0530 Subject: OPP: The level field is always of unsigned int type By mistake, dev_pm_opp_find_level_floor() used the level parameter as unsigned long instead of unsigned int. Fix it. 
Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 81dff7facdc98..74768c47d7904 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -163,7 +163,7 @@ struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, unsigned int *level); struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev, - unsigned long *level); + unsigned int *level); struct dev_pm_opp *dev_pm_opp_find_bw_ceil(struct device *dev, unsigned int *bw, int index); @@ -330,7 +330,7 @@ static inline struct dev_pm_opp *dev_pm_opp_find_level_ceil(struct device *dev, } static inline struct dev_pm_opp *dev_pm_opp_find_level_floor(struct device *dev, - unsigned long *level) + unsigned int *level) { return ERR_PTR(-EOPNOTSUPP); } -- cgit v1.2.3 From 90abde49ea85a8af9a56bbab8c419aefc77f919a Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Tue, 19 Dec 2023 16:53:25 +0200 Subject: net: rename dsa_realloc_skb to skb_ensure_writable_head_tail Rename dsa_realloc_skb to skb_ensure_writable_head_tail and move it to skbuff.c to use it as helper. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50e92c8471dc7..a5ae952454c89 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4007,6 +4007,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); -- cgit v1.2.3 From 932562a6045ed613d45bd100db37114273c22077 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 15:58:20 -0500 Subject: rseq: Split out rseq.h from sched.h We're trying to get sched.h down to more or less just types only, not code - rseq can live in its own header. This helps us kill the dependency on preempt.h in sched.h. 
Signed-off-by: Kent Overstreet --- include/linux/resume_user_mode.h | 1 + include/linux/rseq.h | 131 +++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 125 +------------------------------------ 3 files changed, 133 insertions(+), 124 deletions(-) create mode 100644 include/linux/rseq.h (limited to 'include/linux') diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index f8f3e958e9cf2..e0135e0adae02 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -6,6 +6,7 @@ #include #include #include +#include #include /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h new file mode 100644 index 0000000000000..bc8af3eb55987 --- /dev/null +++ b/include/linux/rseq.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _LINUX_RSEQ_H +#define _LINUX_RSEQ_H + +#ifdef CONFIG_RSEQ + +#include +#include + +/* + * Map the event mask on the user-space ABI enum rseq_cs_flags + * for direct mask checks. 
+ */ +enum rseq_event_mask_bits { + RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, + RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, + RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, +}; + +enum rseq_event_mask { + RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), + RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), + RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), +}; + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ + if (t->rseq) + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); + +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ + if (current->rseq) + __rseq_handle_notify_resume(ksig, regs); +} + +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ + preempt_disable(); + __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); + preempt_enable(); + rseq_handle_notify_resume(ksig, regs); +} + +/* rseq_preempt() requires preemption to be disabled. */ +static inline void rseq_preempt(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* rseq_migrate() requires preemption to be disabled. */ +static inline void rseq_migrate(struct task_struct *t) +{ + __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); + rseq_set_notify_resume(t); +} + +/* + * If parent process has a registered restartable sequences area, the + * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ */ +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ + if (clone_flags & CLONE_VM) { + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; + } else { + t->rseq = current->rseq; + t->rseq_len = current->rseq_len; + t->rseq_sig = current->rseq_sig; + t->rseq_event_mask = current->rseq_event_mask; + } +} + +static inline void rseq_execve(struct task_struct *t) +{ + t->rseq = NULL; + t->rseq_len = 0; + t->rseq_sig = 0; + t->rseq_event_mask = 0; +} + +#else + +static inline void rseq_set_notify_resume(struct task_struct *t) +{ +} +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) +{ +} +static inline void rseq_preempt(struct task_struct *t) +{ +} +static inline void rseq_migrate(struct task_struct *t) +{ +} +static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +{ +} +static inline void rseq_execve(struct task_struct *t) +{ +} + +#endif + +#ifdef CONFIG_DEBUG_RSEQ + +void rseq_syscall(struct pt_regs *regs); + +#else + +static inline void rseq_syscall(struct pt_regs *regs) +{ +} + +#endif + +#endif /* _LINUX_RSEQ_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index dd002d1937268..a588b94988bc4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -2181,129 +2181,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ -#ifdef CONFIG_RSEQ - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. 
- */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} - -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); - -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ - if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); -} - -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); - preempt_enable(); - rseq_handle_notify_resume(ksig, regs); -} - -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* - * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; - } else { - t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; - } -} - -static inline void rseq_execve(struct task_struct *t) -{ - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_mask = 0; -} - -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) -{ -} -static inline void rseq_execve(struct task_struct *t) -{ -} - -#endif - -#ifdef CONFIG_DEBUG_RSEQ - -void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif - #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); -- cgit v1.2.3 From 2b010a69350f2c995f40585fb801904874c85dd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 16:04:03 -0500 Subject: preempt.h: Kill dependency on list.h We really only need types.h, list.h is big. 
Signed-off-by: Kent Overstreet --- include/linux/preempt.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 9aa6358a1a16b..7233e9cf1bab6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -9,7 +9,7 @@ #include #include -#include +#include /* * We put the hardirq and softirq counter into the preemption @@ -360,7 +360,9 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { - INIT_HLIST_NODE(&notifier->link); + /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ + notifier->link.next = NULL; + notifier->link.pprev = NULL; notifier->ops = ops; } -- cgit v1.2.3 From 1e2f2d31997a9496f99e2b43255d6a48b06fbcc2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 15:51:54 -0500 Subject: Kill sched.h dependency on rcupdate.h by moving cond_resched_rcu() to rcupdate_wait.h, we can kill another big sched.h dependency. Signed-off-by: Kent Overstreet --- include/linux/rcupdate_wait.h | 10 ++++++++++ include/linux/sched.h | 15 +++++---------- include/linux/sched/task.h | 1 + 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 5e0f74f2f8ca5..d07f0848802e5 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -8,6 +8,7 @@ #include #include +#include /* * Structure allowing asynchronous waiting on RCU. @@ -55,4 +56,13 @@ do { \ #define synchronize_rcu_mult(...)
\ _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) +static inline void cond_resched_rcu(void) +{ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); +#endif +} + #endif /* _LINUX_SCHED_RCUPDATE_WAIT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a588b94988bc4..814bfdafbc1ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -10,9 +10,14 @@ #include #include +#include +#include +#include +#include #include #include +#include #include #include #include @@ -23,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -2059,15 +2063,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) -static inline void cond_resched_rcu(void) -{ -#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); -#endif -} - #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 1880ae21a9cb7..538cdfbe895f9 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -7,6 +7,7 @@ * functionality: */ +#include #include #include #include -- cgit v1.2.3 From d6b9f4e6f7fb589d8024a31cc4883d15d0c8def4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 Dec 2023 09:23:05 +0000 Subject: block: rename and document BLK_DEF_MAX_SECTORS Give BLK_DEF_MAX_SECTORS a _CAP postfix and document what it is used for. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231227092305.279567-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc236e77d85e1..94701a63ad8aa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1056,7 +1056,14 @@ enum blk_default_limits { BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -#define BLK_DEF_MAX_SECTORS 2560u +/* + * Default upper limit for the software max_sectors limit used for + * regular file system I/O. This can be increased through sysfs. + * + * Not to be confused with the max_hw_sector limit that is entirely + * controlled by the driver, usually based on hardware limits. + */ +#define BLK_DEF_MAX_SECTORS_CAP 2560u static inline unsigned long queue_segment_boundary(const struct request_queue *q) { -- cgit v1.2.3 From 21d706d5cf570917594b21edee81893bdce09ab8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Jul 2021 08:41:17 +0100 Subject: netfs: Add support for DIO buffering Add a bvec array pointer and an iterator to netfs_io_request for either holding a copy of a DIO iterator or a list of all the bits of buffer pointed to by a DIO iterator. There are two problems: Firstly, if an iovec-class iov_iter is passed to ->read_iter() or ->write_iter(), this cannot be passed directly to kernel_sendmsg() or kernel_recvmsg() as that may cause locking recursion if a fault is generated, so we need to keep track of the pages involved separately. Secondly, if the I/O is asynchronous, we must copy the iov_iter describing the buffer before returning to the caller as it may be immediately deallocated. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3da962e977f55..2bb1273b38f42 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -190,6 +190,8 @@ struct netfs_io_request { struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ + struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ + unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ @@ -197,6 +199,7 @@ struct netfs_io_request { size_t len; /* Length of the request */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ + bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ loff_t i_size; /* Size of the file */ loff_t start; /* Start position */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ -- cgit v1.2.3 From 7d828a06634799aba0fa392913c7fe2953eb64a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Sep 2023 13:25:22 +0100 Subject: netfs: Provide tools to create a buffer in an xarray Provide tools to create a buffer in an xarray, with a function to add new folios with a mark. This will be used to create bounce buffer and can be used more easily to create a list of folios the span of which would require more than a page's worth of bio_vec structs. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 2bb1273b38f42..c05365e3f4281 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -109,6 +109,10 @@ static inline int wait_on_page_fscache_killable(struct page *page) return folio_wait_private_2_killable(page_folio(page)); } +/* Marks used on xarray-based buffers */ +#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ +#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ + enum netfs_io_source { NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, -- cgit v1.2.3 From cae932d3aee55035a54415dcea8e7ecf2ec469b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Sep 2023 14:49:47 +0100 Subject: netfs: Add func to calculate pagecount/size-limited span of an iterator Add a function to work out how much of an ITER_BVEC or ITER_XARRAY iterator we can use in a pagecount-limited and size-limited span. This will be used, for example, to limit the number of segments in a subrequest to the maximum number of elements that an RDMA transfer can handle. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c05365e3f4281..d673d0785b9d9 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -321,6 +321,8 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, struct iov_iter *new, iov_iter_extraction_t extraction_flags); +size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, + size_t max_size, size_t max_segs); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); -- cgit v1.2.3 From 768ddb1eacf5dd997ecf393e7bab9796bad047e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 27 May 2022 13:45:28 +0100 Subject: netfs: Limit subrequest by size or number of segments Limit a subrequest to a maximum size and/or a maximum number of contiguous physical regions. This permits, for instance, a subreq's iterator to be limited to the number of DMA'able segments that a large RDMA request can handle.
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d673d0785b9d9..44cd13ad695af 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -161,6 +161,7 @@ struct netfs_io_subrequest { refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ + unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ -- cgit v1.2.3 From 16af134ca4b7051b1587108f2066ec90ae029f74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Feb 2022 19:52:13 +0000 Subject: netfs: Extend the netfs_io_*request structs to handle writes Modify the netfs_io_request struct to act as a point around which writes can be coordinated. It represents and pins a range of pages that need writing and a list of regions of dirty data in that range of pages. If RMW is required, the original data can be downloaded into the bounce buffer, decrypted if necessary, the modifications made, then the modified data can be reencrypted/recompressed and sent back to the server. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 44cd13ad695af..f302123a3e384 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -118,6 +118,9 @@ enum netfs_io_source { NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, NETFS_INVALID_READ, + NETFS_UPLOAD_TO_SERVER, + NETFS_WRITE_TO_CACHE, + NETFS_INVALID_WRITE, } __mode(byte); typedef void (*netfs_io_terminated_t)(void *priv, ssize_t transferred_or_error, @@ -149,9 +152,14 @@ struct netfs_cache_resources { }; /* - * Descriptor for a single component subrequest. + * Descriptor for a single component subrequest. Each operation represents an + * individual read/write from/to a server, a cache, a journal, etc.. + * + * The buffer iterator is persistent for the life of the subrequest struct and + * the pages it points to can be relied on to exist for the duration. 
*/ struct netfs_io_subrequest { + struct work_struct work; struct netfs_io_request *rreq; /* Supervising I/O request */ struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ @@ -176,6 +184,8 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ + NETFS_WRITEBACK, /* This write was triggered by writepages */ + nr__netfs_io_origin } __mode(byte); /* @@ -198,6 +208,7 @@ struct netfs_io_request { struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; + unsigned int subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ @@ -216,6 +227,8 @@ struct netfs_io_request { #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ +#define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ +#define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ const struct netfs_request_ops *netfs_ops; }; -- cgit v1.2.3 From c6dc54dd91bbf597942b4975b8adec660a16827d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Feb 2022 12:27:53 +0000 Subject: netfs: Add a hook to allow tell the netfs to update its i_size Add a hook for netfslib's write helpers to call to tell the network filesystem that it should update its i_size. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f302123a3e384..3fc41f616621b 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -242,6 +242,7 @@ struct netfs_request_ops { void (*free_request)(struct netfs_io_request *rreq); void (*free_subrequest)(struct netfs_io_subrequest *rreq); + /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); bool (*clamp_length)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); @@ -249,6 +250,9 @@ struct netfs_request_ops { int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); + + /* Modification handling */ + void (*update_i_size)(struct inode *inode, loff_t i_size); }; /* -- cgit v1.2.3 From 9ebff83e648148b9ece97d4e4890dd84ca54d6ce Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 29 Sep 2023 17:28:25 +0100 Subject: netfs: Prep to use folio->private for write grouping and streaming write Prepare to use folio->private to hold information for write grouping and streaming write. These are implemented in the same commit as they both make use of folio->private and will be both checked at the same time in several places. "Write grouping" involves ordering the writeback of groups of writes, such as is needed for ceph snaps. A group is represented by a filesystem-supplied object which must contain a netfs_group struct. This contains just a refcount and a pointer to a destructor. "Streaming write" is the storage of data in folios that are marked dirty, but not uptodate, to avoid unnecessary reads of data. This is represented by a netfs_folio struct.
This contains the offset and length of the modified region plus the otherwise displaced write grouping pointer. The way folio->private is multiplexed is: (1) If private is NULL then neither is in operation on a dirty folio. (2) If private is set, with bit 0 clear, then this points to a group. (3) If private is set, with bit 0 set, then this points to a netfs_folio struct (with bit 0 AND'ed out). Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3fc41f616621b..cfba83e3e3d2a 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -140,6 +140,47 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ }; +/* + * A netfs group - for instance a ceph snap. This is marked on dirty pages and + * pages marked with a group must be flushed before they can be written under + * the domain of another group. + */ +struct netfs_group { + refcount_t ref; + void (*free)(struct netfs_group *netfs_group); +}; + +/* + * Information about a dirty page (attached only if necessary). + * folio->private + */ +struct netfs_folio { + struct netfs_group *netfs_group; /* Filesystem's grouping marker (or NULL). */ + unsigned int dirty_offset; /* Write-streaming dirty data offset */ + unsigned int dirty_len; /* Write-streaming dirty data length */ +}; +#define NETFS_FOLIO_INFO 0x1UL /* OR'd with folio->private. 
*/ + +static inline struct netfs_folio *netfs_folio_info(struct folio *folio) +{ + void *priv = folio_get_private(folio); + + if ((unsigned long)priv & NETFS_FOLIO_INFO) + return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO); + return NULL; +} + +static inline struct netfs_group *netfs_folio_group(struct folio *folio) +{ + struct netfs_folio *finfo; + void *priv = folio_get_private(folio); + + finfo = netfs_folio_info(folio); + if (finfo) + return finfo->netfs_group; + return priv; +} + /* * Resources required to do operations on a cache. */ -- cgit v1.2.3 From 0e0f2dfe880fb19e4b15a7ca468623eb0b4ba586 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 29 Jun 2021 22:31:48 +0100 Subject: netfs: Dispatch write requests to process a writeback slice Dispatch one or more write requests to process a writeback slice, where a slice is tailored more to logical block divisions within the file (such as crypto blocks, an object layout or cache granules) than the protocol RPC maximum capacity. The dispatch doesn't happen until throttling allows, at which point the entire writeback slice is processed and queued. A slice may be written to multiple destinations (one or more servers and the local cache) and the writes to each destination might be split up along different lines. The writeback slice holds the required folios pinned. An iov_iter is provided in netfs_write_request that describes the buffer to be used. This may be part of the pagecache, may have auxiliary padding pages attached or may be a bounce buffer resulting from crypto or compression. Consequently, the filesystem must not twiddle the folio markings directly. The following API is available to the filesystem: (1) The ->create_write_requests() method is called to ask the filesystem to create the requests it needs. This is passed the writeback slice to be processed. (2) The filesystem should then call netfs_create_write_request() to create the requests it needs.
(3) Once a request is initialised, netfs_queue_write_request() can be called to dispatch it asynchronously, if not completed immediately. (4) netfs_write_request_completed() should be called to note the completion of a request. (5) netfs_get_write_request() and netfs_put_write_request() are provided to refcount a request. These take constants from the netfs_wreq_trace enum for logging into ftrace. (6) The ->free_write_request method is called to ask the filesystem to clean up a request. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index cfba83e3e3d2a..890a5d8b22992 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -249,6 +249,7 @@ struct netfs_io_request { struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; + unsigned int wsize; /* Maximum write size (0 for none) */ unsigned int subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ @@ -271,6 +272,7 @@ struct netfs_io_request { #define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ const struct netfs_request_ops *netfs_ops; + void (*cleanup)(struct netfs_io_request *req); }; /* @@ -294,6 +296,11 @@ struct netfs_request_ops { /* Modification handling */ void (*update_i_size)(struct inode *inode, loff_t i_size); + + /* Write request handling */ + void (*create_write_requests)(struct netfs_io_request *wreq, + loff_t start, size_t len); + void (*invalidate_cache)(struct netfs_io_request *wreq); }; /* @@ -382,6 +389,12 @@ ssize_t
netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len, iov_iter_extraction_t extraction_flags); size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset, size_t max_size, size_t max_segs); +struct netfs_io_subrequest *netfs_create_write_request( + struct netfs_io_request *wreq, enum netfs_io_source dest, + loff_t start, size_t len, work_func_t worker); +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, + bool was_async); +void netfs_queue_write_request(struct netfs_io_subrequest *subreq); int netfs_start_io_read(struct inode *inode); void netfs_end_io_read(struct inode *inode); -- cgit v1.2.3 From c38f4e96e605f17990e871214e6ea1496bc4e65f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jun 2021 13:09:21 +0100 Subject: netfs: Provide func to copy data to pagecache for buffered write Provide a netfs write helper, netfs_perform_write() to buffer data to be written in the pagecache and mark the modified folios dirty. It will perform "streaming writes" for folios that aren't currently resident, if possible, storing data in partially modified folios that are marked dirty, but not uptodate. It will also tag pages as belonging to fs-specific write groups if so directed by the filesystem. This is derived from generic_perform_write(), but doesn't use ->write_begin() and ->write_end(), having that logic rolled in instead. 
Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 890a5d8b22992..70f578cf3715f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -367,6 +367,11 @@ struct netfs_cache_ops { loff_t *_data_start, size_t *_data_len); }; +/* High-level write API */ +ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, + struct netfs_group *netfs_group); + +/* Address operations API */ struct readahead_control; void netfs_readahead(struct readahead_control *); int netfs_read_folio(struct file *, struct folio *); -- cgit v1.2.3 From 016dc8516aec8719641e7aaaacd78d344759178e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 17:39:55 +0000 Subject: netfs: Implement unbuffered/DIO read support Implement support for unbuffered and DIO reads in the netfs library, utilising the existing read helper code to do block splitting and individual queuing. The code also handles extraction of the destination buffer from the supplied iterator, allowing async unbuffered reads to take place. The read will be split up according to the rsize setting and, if supplied, the ->clamp_length() method. Note that the next subrequest will be issued as soon as issue_op returns, without waiting for previous ones to finish. The network filesystem needs to pause or handle queuing them if it doesn't want to fire them all at the server simultaneously. Once all the subrequests have finished, the state will be assessed and the amount of data to be indicated as having been obtained will be determined. As the subrequests may finish in any order, if an intermediate subrequest is short, any further subrequests may be copied into the buffer and then abandoned. 
In the future, this will also take care of doing an unbuffered read from encrypted content, with the decryption being done by the library. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 70f578cf3715f..7c13095684598 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -226,6 +226,7 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_DIO_READ, /* This is a direct I/O read */ nr__netfs_io_origin } __mode(byte); @@ -240,6 +241,7 @@ struct netfs_io_request { }; struct inode *inode; /* The file being accessed */ struct address_space *mapping; /* The mapping being accessed */ + struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ @@ -249,12 +251,14 @@ struct netfs_io_request { struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ unsigned int direct_bv_count; /* Number of elements in direct_bv[] */ unsigned int debug_id; + unsigned int rsize; /* Maximum read size (0 for none) */ unsigned int wsize; /* Maximum write size (0 for none) */ unsigned int subreq_counter; /* Next subreq->debug_index */ atomic_t nr_outstanding; /* Number of ops in progress */ atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ size_t len; /* Length of the request */ + size_t transferred; /* Amount to be indicated as transferred */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ 
bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ @@ -271,6 +275,8 @@ struct netfs_io_request { #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ #define NETFS_RREQ_WRITE_TO_CACHE 7 /* Need to write to the cache */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ +#define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ +#define NETFS_RREQ_BLOCKED 10 /* We blocked */ const struct netfs_request_ops *netfs_ops; void (*cleanup)(struct netfs_io_request *req); }; @@ -367,6 +373,9 @@ struct netfs_cache_ops { loff_t *_data_start, size_t *_data_len); }; +/* High-level read API. */ +ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); + /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); -- cgit v1.2.3 From 153a9961b551101cd38e94e26cd92fbfd198b19b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 21 Feb 2022 11:38:17 +0000 Subject: netfs: Implement unbuffered/DIO write support Implement support for unbuffered writes and direct I/O writes. If the write is misaligned with respect to the fscrypt block size, then RMW cycles are performed if necessary. DIO writes are a special case of unbuffered writes with extra restriction imposed, such as block size alignment requirements. Also provide a field that can tell the code to add some extra space onto the bounce buffer for use by the filesystem in the case of a content-encrypted file. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 7c13095684598..e1dfd6775c2c3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -138,6 +138,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ +#define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ }; /* @@ -226,7 +227,9 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ + NETFS_DIO_WRITE, /* This is a direct I/O write */ nr__netfs_io_origin } __mode(byte); @@ -379,6 +382,7 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); +ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Address operations API */ struct readahead_control; -- cgit v1.2.3 From 938e13a73b244278a3777f38fa915bd239b2efd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Jun 2021 13:09:21 +0100 Subject: netfs: Implement buffered write API Institute a netfs write helper, netfs_file_write_iter(), to be pointed at by the network filesystem ->write_iter() call. Make it handle buffered writes by calling the previously defined netfs_perform_write() to copy the source data into the pagecache. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index e1dfd6775c2c3..0948ecf69aa5d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -382,7 +382,10 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, struct netfs_group *netfs_group); +ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from, + struct netfs_group *netfs_group); ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from); +ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); /* Address operations API */ struct readahead_control; -- cgit v1.2.3 From 102a7e2c598c22bd2621fa97eb1c93c89d469a12 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 15 Feb 2022 23:15:57 +0000 Subject: netfs: Allow buffered shared-writeable mmap through netfs_page_mkwrite() Provide an entry point to delegate a filesystem's ->page_mkwrite() to. This checks for conflicting writes, then attaches any netfs-specific group marking (e.g. ceph snap) to the page to be considered dirty. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 0948ecf69aa5d..d7f324c7c22ae 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -400,6 +400,10 @@ void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); +/* VMA operations API. */ +vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); + +/* (Sub)request management API. */ void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); -- cgit v1.2.3 From 80645bd4aa33a5c325f11b8dc6b38b38410ad5c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 Oct 2023 09:29:43 +0100 Subject: netfs: Provide netfs_file_read_iter() Provide a top-level-ish function that can be pointed to directly by ->read_iter file op. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d7f324c7c22ae..19a41c437af3f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -378,6 +378,8 @@ struct netfs_cache_ops { /* High-level read API. 
*/ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); /* High-level write API */ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From e0ace6ca98bef0d8d354040f13ffc0a498813ee9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Nov 2023 17:18:17 +0000 Subject: netfs, cachefiles: Pass upper bound length to allow expansion Make netfslib pass the maximum length to the ->prepare_write() op to tell the cache how much it can expand the length of a write to. This allows a write to the server at the end of a file to be limited to a few bytes whilst writing an entire block to the cache (something required by direct I/O). Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 19a41c437af3f..2856389f4694c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -261,6 +261,7 @@ struct netfs_io_request { atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ size_t submitted; /* Amount submitted for I/O so far */ size_t len; /* Length of the request */ + size_t upper_len; /* Length can be extended to here */ size_t transferred; /* Amount to be indicated as transferred */ short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ @@ -357,8 +358,8 @@ struct netfs_cache_ops { * actually do. 
*/ int (*prepare_write)(struct netfs_cache_resources *cres, - loff_t *_start, size_t *_len, loff_t i_size, - bool no_space_allocated_yet); + loff_t *_start, size_t *_len, size_t upper_len, + loff_t i_size, bool no_space_allocated_yet); /* Prepare an on-demand read operation, shortening it to a cached/uncached * boundary as appropriate. -- cgit v1.2.3 From 62c3b7481b9a108cb99ef9438dba66bb4738768b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 28 Sep 2023 11:46:49 +0100 Subject: netfs: Provide a writepages implementation Provide an implementation of writepages for network filesystems to delegate to. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 2856389f4694c..86bb8cb7f8d08 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -397,6 +397,8 @@ int netfs_read_folio(struct file *, struct folio *); int netfs_write_begin(struct netfs_inode *, struct file *, struct address_space *, loff_t pos, unsigned int len, struct folio **, void **fsdata); +int netfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio); int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); -- cgit v1.2.3 From 4a79616cfb27d76947ea37f0336745ef929d56be Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 5 Oct 2023 16:52:58 +0100 Subject: netfs: Provide a launder_folio implementation Provide a launder_folio implementation for netfslib. 
Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 86bb8cb7f8d08..29c66acad9256 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -227,6 +227,7 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_LAUNDER_WRITE, /* This is triggered by ->launder_folio() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ @@ -404,6 +405,7 @@ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc); void netfs_clear_inode_writeback(struct inode *inode, const void *aux); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool netfs_release_folio(struct folio *folio, gfp_t gfp); +int netfs_launder_folio(struct folio *folio); /* VMA operations API. */ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); -- cgit v1.2.3 From 41d8e7673a7726cba57cb8112d81c89cfb6c3e35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Oct 2023 09:06:24 +0100 Subject: netfs: Implement a write-through caching option Provide a flag whereby a filesystem may request that netfs_perform_write() perform write-through caching. This involves putting pages directly into writeback rather than dirty and attaching them to a write operation as we go. Further, the writes being made are limited to the byte range being written rather than whole folios being written. This can be used by cifs, for example, to deal with strict byte-range locking. 
This can't be used with content encryption as that may require expansion of the write RPC beyond the write being made. This doesn't affect writes via mmap - those are written back in the normal way; similarly failed writethrough writes are marked dirty and left to writeback to retry. Another option would be to simply invalidate them, but the contents can be simultaneously accessed by read() and through mmap. Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 29c66acad9256..8a2dd882a7814 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -139,6 +139,7 @@ struct netfs_inode { unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ +#define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ }; /* @@ -227,6 +228,7 @@ enum netfs_io_origin { NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_WRITEBACK, /* This write was triggered by writepages */ + NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_LAUNDER_WRITE, /* This is triggered by ->launder_folio() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_READ, /* This is a direct I/O read */ -- cgit v1.2.3 From 100ccd18bb41ea7abb4fbb419202c06079559501 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Nov 2023 13:39:02 +0000 Subject: netfs: Optimise away reads above the point at which there can be no data Track the file position above which the server is not expected to have any data (the "zero point") and preemptively assume that we can satisfy requests by filling them with zeroes locally rather than attempting to download them if they're over that 
line - even if we've written data back to the server. Assume that any data that was written back above that position is held in the local cache. Note that we have to split requests that straddle the line. Make use of this to optimise away some reads from the server. We need to set the zero point in the following circumstances: (1) When we see an extant remote inode and have no cache for it, we set the zero_point to i_size. (2) On local inode creation, we set zero_point to 0. (3) On local truncation down, we reduce zero_point to the new i_size if the new i_size is lower. (4) On local truncation up, we don't change zero_point. (5) On local modification, we don't change zero_point. (6) On remote invalidation, we set zero_point to the new i_size. (7) If stored data is discarded from the pagecache or culled from fscache, we must set zero_point above that if the data also got written to the server. (8) If dirty data is written back to the server, but not fscache, we must set zero_point above that. (9) If a direct I/O write is made, set zero_point above that. Assuming the above, any read from the server at or above the zero_point position will return all zeroes. The zero_point value can be stored in the cache, provided the above rules are applied to it by any code that culls part of the local cache. 
Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 8a2dd882a7814..852956aa3c4bb 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -136,6 +136,8 @@ struct netfs_inode { struct fscache_cookie *cache; #endif loff_t remote_i_size; /* Size of the remote file */ + loff_t zero_point; /* Size after which we assume there's no data + * on the server */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -453,31 +455,44 @@ static inline struct netfs_inode *netfs_inode(struct inode *inode) * netfs_inode_init - Initialise a netfslib inode context * @ctx: The netfs inode to initialise * @ops: The netfs's operations list + * @use_zero_point: True to use the zero_point read optimisation * * Initialise the netfs library context struct. This is expected to follow on * directly from the VFS inode struct. */ static inline void netfs_inode_init(struct netfs_inode *ctx, - const struct netfs_request_ops *ops) + const struct netfs_request_ops *ops, + bool use_zero_point) { ctx->ops = ops; ctx->remote_i_size = i_size_read(&ctx->inode); + ctx->zero_point = LLONG_MAX; ctx->flags = 0; #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif + /* ->releasepage() drives zero_point */ + if (use_zero_point) { + ctx->zero_point = ctx->remote_i_size; + mapping_set_release_always(ctx->inode.i_mapping); + } } /** * netfs_resize_file - Note that a file got resized * @ctx: The netfs inode being resized * @new_i_size: The new file size + * @changed_on_server: The change was applied to the server * * Inform the netfs lib that a file got resized so that it can adjust its state. 
*/ -static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size) +static inline void netfs_resize_file(struct netfs_inode *ctx, loff_t new_i_size, + bool changed_on_server) { - ctx->remote_i_size = new_i_size; + if (changed_on_server) + ctx->remote_i_size = new_i_size; + if (new_i_size < ctx->zero_point) + ctx->zero_point = new_i_size; } /** -- cgit v1.2.3 From c39e2ae3943d4ee278af4e1b1dcfd5946da1089b Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 28 Dec 2023 11:06:08 +0100 Subject: fs: fix __sb_write_started() kerneldoc formatting When running 'make htmldocs', I see the following warning: Documentation/filesystems/api-summary:14: ./include/linux/fs.h:1659: WARNING: Definition list ends without a blank line; unexpected unindent. The official guidance [1] seems to be to use lists, which will prevent both the "unexpected unindent" warning as well as ensure that each line is formatted on a separate line in the HTML output instead of being all considered a single paragraph. 
[1]: https://docs.kernel.org/doc-guide/kernel-doc.html#return-values Fixes: 8802e580ee64 ("fs: create __sb_write_started() helper") Cc: Amir Goldstein Cc: Josef Bacik Cc: Jan Kara Signed-off-by: Vegard Nossum Link: https://lore.kernel.org/r/20231228100608.3123987-1-vegard.nossum@oracle.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index db5d07e6e02ee..473063f385e54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1650,9 +1650,9 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) * @sb: the super we write to * @level: the freeze level * - * > 0 sb freeze level is held - * 0 sb freeze level is not held - * < 0 !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { -- cgit v1.2.3 From 0b68ab50b8101a35b51fb9ec203cd988e47dbed3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 23 Dec 2023 14:53:47 +0100 Subject: sysctl: delete unused define SYSCTL_PERM_EMPTY_DIR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems it was never used. 
Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration") Signed-off-by: Thomas Weißschuh Signed-off-by: Luis Chamberlain --- include/linux/sysctl.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 61b40ea81f4d3..26a38161c28f9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -255,8 +255,6 @@ extern int unaligned_enabled; extern int unaligned_dump_stack; extern int no_unaligned_warning; -#define SYSCTL_PERM_EMPTY_DIR (1 << 0) - #else /* CONFIG_SYSCTL */ static inline void register_sysctl_init(const char *path, struct ctl_table *table) -- cgit v1.2.3 From 561429807d50aad76f1205b0b1d7b4aacf365d4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 20 Dec 2023 22:23:35 +0100 Subject: sysctl: remove struct ctl_path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All usages of this struct have been removed from the kernel tree. The struct is still referenced by scripts/check-sysctl-docs but that script is broken anyways as it only supports the register_sysctl_paths() API and not the currently used register_sysctl() one. 
Fixes: 0199849acd07 ("sysctl: remove register_sysctl_paths()") Signed-off-by: Thomas Weißschuh Reviewed-by: Joel Granados Signed-off-by: Luis Chamberlain --- include/linux/sysctl.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 26a38161c28f9..ee7d33b89e9ef 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -210,11 +210,6 @@ struct ctl_table_root { int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); }; -/* struct ctl_path describes where in the hierarchy a table is added */ -struct ctl_path { - const char *procname; -}; - #define register_sysctl(path, table) \ register_sysctl_sz(path, table, ARRAY_SIZE(table)) -- cgit v1.2.3 From 4e814173a8c4f432fd068b1c796f0416328c9d99 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 18 Dec 2023 20:25:02 +0100 Subject: thermal: core: Fix thermal zone suspend-resume synchronization There are 3 synchronization issues with thermal zone suspend-resume during system-wide transitions: 1. The resume code runs in a PM notifier which is invoked after user space has been thawed, so it can run concurrently with user space which can trigger a thermal zone device removal. If that happens, the thermal zone resume code may use a stale pointer to the next list element and crash, because it does not hold thermal_list_lock while walking thermal_tz_list. 2. The thermal zone resume code calls thermal_zone_device_init() outside the zone lock, so user space or an update triggered by the platform firmware may see an inconsistent state of a thermal zone leading to unexpected behavior. 3. Clearing the in_suspend global variable in thermal_pm_notify() allows __thermal_zone_device_update() to continue for all thermal zones and it may as well run before the thermal_tz_list walk (or at any point during the list walk for that matter) and attempt to operate on a thermal zone that has not been resumed yet. 
It may also race destructively with thermal_zone_device_init(). To address these issues, add thermal_list_lock locking to thermal_pm_notify(), especially around the thermal_tz_list, make it call thermal_zone_device_init() back-to-back with __thermal_zone_device_update() under the zone lock and replace in_suspend with per-zone bool "suspend" indicators set and unset under the given zone's lock. Link: https://lore.kernel.org/linux-pm/20231218162348.69101-1-bo.ye@mediatek.com/ Reported-by: Bo Ye Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 09f6eb82c191c..d00622b64d502 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -152,6 +152,7 @@ struct thermal_cooling_device { * @node: node in thermal_tz_list (in thermal_core.c) * @poll_queue: delayed work for polling * @notify_event: Last notification event + * @suspended: thermal zone suspend indicator */ struct thermal_zone_device { int id; @@ -185,6 +186,7 @@ struct thermal_zone_device { struct list_head node; struct delayed_work poll_queue; enum thermal_notify_event notify_event; + bool suspended; }; /** -- cgit v1.2.3 From 753547de0daecbdbd1af3618987ddade325d9aaa Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 28 Dec 2023 11:36:03 +0100 Subject: linux/export: Ensure natural alignment of kcrctab array The ___kcrctab section holds an array of 32-bit CRC values. Add a .balign 4 to tell the linker the correct memory alignment. 
Fixes: f3304ecd7f06 ("linux/export: use inline assembler to populate symbol CRCs") Signed-off-by: Helge Deller Signed-off-by: Masahiro Yamada --- include/linux/export-internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index cd253eb51d6c0..d445705ac13c0 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -64,6 +64,7 @@ #define SYMBOL_CRC(sym, crc, sec) \ asm(".section \"___kcrctab" sec "+" #sym "\",\"a\"" "\n" \ + ".balign 4" "\n" \ "__crc_" #sym ":" "\n" \ ".long " #crc "\n" \ ".previous" "\n") -- cgit v1.2.3 From 1271ca00aa7f9bb3fd94cb7ac8f654de71099580 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 19 Dec 2023 16:55:31 -0700 Subject: ethtool: reformat kerneldoc for struct ethtool_fec_stats The kerneldoc comment for struct ethtool_fec_stats attempts to describe the "total" and "lanes" fields of the ethtool_fec_stat substructure in a way leading to these warnings: ./include/linux/ethtool.h:424: warning: Excess struct member 'lane' description in 'ethtool_fec_stats' ./include/linux/ethtool.h:424: warning: Excess struct member 'total' description in 'ethtool_fec_stats' Reformat the comment to retain the information while eliminating the warnings. Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/ethtool.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index cfcd952a1d4f1..325e0778e9371 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -411,8 +411,10 @@ struct ethtool_pause_stats { * not entire FEC data blocks. This is a non-standard statistic. * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. 
* - * @lane: per-lane/PCS-instance counts as defined by the standard - * @total: error counts for the entire port, for drivers incapable of reporting + * For each of the above fields, the two substructure members are: + * + * - @lanes: per-lane/PCS-instance counts as defined by the standard + * - @total: error counts for the entire port, for drivers incapable of reporting * per-lane stats * * Drivers should fill in either only total or per-lane statistics, core -- cgit v1.2.3 From a8c959402d4dd6823918b33828d79900ae58c700 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 20 Dec 2023 23:17:45 +0000 Subject: thermal: core: Add governor callback for thermal zone change Add a new callback to the struct thermal_governor. It can be used for updating governors when there is a change in the thermal zone internals, e.g. thermal cooling device is bound to the thermal zone. That makes it possible to move some heavy operations like memory allocations related to the number of cooling instances out of the throttle() callback. Both callback code paths (throttle() and update_tz()) are protected with the same thermal zone lock, which guarantees the consistency. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d00622b64d502..4d96fefb27679 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -51,6 +51,8 @@ enum thermal_notify_event { THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */ THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */ THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */ + THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */ + THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */ }; /** @@ -199,6 +201,8 @@ struct thermal_zone_device { * thermal zone. 
* @throttle: callback called for every trip point even if temperature is * below the trip point temperature + * @update_tz: callback called when thermal zone internals have changed, e.g. + * thermal cooling instance was added/removed * @governor_list: node in thermal_governor_list (in thermal_core.c) */ struct thermal_governor { @@ -207,6 +211,8 @@ struct thermal_governor { void (*unbind_from_tz)(struct thermal_zone_device *tz); int (*throttle)(struct thermal_zone_device *tz, const struct thermal_trip *trip); + void (*update_tz)(struct thermal_zone_device *tz, + enum thermal_notify_event reason); struct list_head governor_list; }; -- cgit v1.2.3 From bfc57bd1685981730bfe9802d9de7603a0a43bc4 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 20 Dec 2023 23:17:52 +0000 Subject: thermal/sysfs: Update governors when the 'weight' has changed Support updating governors when the thermal instance's weight has changed. This allows the governor to adjust its internal state. Signed-off-by: Lukasz Luba [ rjw: Add two empty code lines around the locking ] Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4d96fefb27679..9d0427da32af5 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -53,6 +53,7 @@ enum thermal_notify_event { THERMAL_EVENT_KEEP_ALIVE, /* Request for user space handler to respond */ THERMAL_TZ_BIND_CDEV, /* Cooling dev is bind to the thermal zone */ THERMAL_TZ_UNBIND_CDEV, /* Cooling dev is unbind from the thermal zone */ + THERMAL_INSTANCE_WEIGHT_CHANGED, /* Thermal instance weight changed */ }; /** -- cgit v1.2.3 From adef440691bab824e39c1b17382322d195e1fab0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Dec 2023 02:36:56 -0800 Subject: userfaultfd: UFFDIO_MOVE uABI Implement the uABI of UFFDIO_MOVE ioctl.
UFFDIO_COPY performs ~20% better than UFFDIO_MOVE when the application needs pages to be allocated [1]. However, with UFFDIO_MOVE, if pages are available (in userspace) for recycling, as is usually the case in heap compaction algorithms, then we can avoid the page allocation and memcpy (done by UFFDIO_COPY). Also, since the pages are recycled in the userspace, we avoid the need to release (via madvise) the pages back to the kernel [2]. We see over 40% reduction (on a Google pixel 6 device) in the compacting thread's completion time by using UFFDIO_MOVE vs. UFFDIO_COPY. This was measured using a benchmark that emulates a heap compaction implementation using userfaultfd (to allow concurrent accesses by application threads). More details of the usecase are explained in [2]. Furthermore, UFFDIO_MOVE enables moving swapped-out pages without touching them within the same vma. Today, it can only be done by mremap, however it forces splitting the vma. [1] https://lore.kernel.org/all/1425575884-2574-1-git-send-email-aarcange@redhat.com/ [2] https://lore.kernel.org/linux-mm/CA+EESO4uO84SSnBhArH4HvLNhaUQ5nZKNKXqxRCyjniNVjp0Aw@mail.gmail.com/ Update for the ioctl_userfaultfd(2) manpage: UFFDIO_MOVE (Since Linux xxx) Move a continuous memory chunk into the userfault registered range and optionally wake up the blocked thread. 
The source and destination addresses and the number of bytes to move are specified by the src, dst, and len fields of the uffdio_move structure pointed to by argp: struct uffdio_move { __u64 dst; /* Destination of move */ __u64 src; /* Source of move */ __u64 len; /* Number of bytes to move */ __u64 mode; /* Flags controlling behavior of move */ __s64 move; /* Number of bytes moved, or negated error */ }; The following value may be bitwise ORed in mode to change the behavior of the UFFDIO_MOVE operation: UFFDIO_MOVE_MODE_DONTWAKE Do not wake up the thread that waits for page-fault resolution UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES Allow holes in the source virtual range that is being moved. When not specified, the holes will result in ENOENT error. When specified, the holes will be accounted as successfully moved memory. This is mostly useful to move hugepage aligned virtual regions without knowing if there are transparent hugepages in the regions or not, but preventing the risk of having to split the hugepage during the operation. The move field is used by the kernel to return the number of bytes that was actually moved, or an error (a negated errno- style value). If the value returned in move doesn't match the value that was specified in len, the operation fails with the error EAGAIN. The move field is output-only; it is not read by the UFFDIO_MOVE operation. The operation may fail for various reasons. Usually, remapping of pages that are not exclusive to the given process fail; once KSM might deduplicate pages or fork() COW-shares pages during fork() with child processes, they are no longer exclusive. Further, the kernel might only perform lightweight checks for detecting whether the pages are exclusive, and return -EBUSY in case that check fails. To make the operation more likely to succeed, KSM should be disabled, fork() should be avoided or MADV_DONTFORK should be configured for the source VMA before fork(). This ioctl(2) operation returns 0 on success. 
In this case, the entire area was moved. On error, -1 is returned and errno is set to indicate the error. Possible errors include: EAGAIN The number of bytes moved (i.e., the value returned in the move field) does not equal the value that was specified in the len field. EINVAL Either dst or len was not a multiple of the system page size, or the range specified by src and len or dst and len was invalid. EINVAL An invalid bit was specified in the mode field. ENOENT The source virtual memory range has unmapped holes and UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES is not set. EEXIST The destination virtual memory range is fully or partially mapped. EBUSY The pages in the source virtual memory range are either pinned or not exclusive to the process. The kernel might only perform lightweight checks for detecting whether the pages are exclusive. To make the operation more likely to succeed, KSM should be disabled, fork() should be avoided or MADV_DONTFORK should be configured for the source virtual memory area before fork(). ENOMEM Allocating memory needed for the operation failed. ESRCH The target process has exited at the time of a UFFDIO_MOVE operation. Link: https://lkml.kernel.org/r/20231206103702.3873743-3-surenb@google.com Signed-off-by: Andrea Arcangeli Signed-off-by: Suren Baghdasaryan Cc: Al Viro Cc: Axel Rasmussen Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. 
Howlett Cc: Lokesh Gidra Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: ZhangPeng Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 +++++ include/linux/userfaultfd_k.h | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3c2fc291b071d..af6a32b6f3e7b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -121,6 +121,11 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma) down_write(&anon_vma->root->rwsem); } +static inline int anon_vma_trylock_write(struct anon_vma *anon_vma) +{ + return down_write_trylock(&anon_vma->root->rwsem); +} + static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2dc19f40d059..e4056547fbe61 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -93,6 +93,17 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm, extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); +/* move_pages */ +void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); +void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); +ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm, + unsigned long dst_start, unsigned long src_start, + unsigned long len, __u64 flags); +int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr); + /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) -- cgit v1.2.3 From 96db66d9c8f3c1547325af01b1f328b85d6ee1b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" 
Date: Mon, 11 Dec 2023 16:22:06 +0000 Subject: mm: convert ksm_might_need_to_copy() to work on folios Patch series "Finish two folio conversions". Most callers of page_add_new_anon_rmap() and lru_cache_add_inactive_or_unevictable() have been converted to their folio equivalents, but there are still a few stragglers. There's a bit of preparatory work in ksm and unuse_pte(), but after that it's pretty mechanical. This patch (of 9): Accept a folio as an argument and return a folio result. Removes a call to compound_head() in do_swap_page(), and prevents folio & page from getting out of sync in unuse_pte(). Reviewed-by: David Hildenbrand [willy@infradead.org: fix smatch warning] Link: https://lkml.kernel.org/r/ZXnPtblC6A1IkyAB@casper.infradead.org [david@redhat.com: only adjust the page if the folio changed] Link: https://lkml.kernel.org/r/6a8f2110-fa91-4c10-9eae-88315309a6e3@redhat.com Link: https://lkml.kernel.org/r/20231211162214.2146080-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231211162214.2146080-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/ksm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 4643d5244e77c..401348e9f92b4 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -76,7 +76,7 @@ static inline void ksm_exit(struct mm_struct *mm) * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? 
*/ -struct page *ksm_might_need_to_copy(struct page *page, +struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); @@ -129,10 +129,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, return 0; } -static inline struct page *ksm_might_need_to_copy(struct page *page, +static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { - return page; + return folio; } static inline void rmap_walk_ksm(struct folio *folio, -- cgit v1.2.3 From cafa8e37a2ebd344ae0774324c21f46640bbaab3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 11 Dec 2023 16:22:14 +0000 Subject: mm: remove page_add_new_anon_rmap and lru_cache_add_inactive_or_unevictable All callers have now been converted to folio_add_new_anon_rmap() and folio_add_lru_vma() so we can remove the wrapper. Link: https://lkml.kernel.org/r/20231211162214.2146080-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- include/linux/swap.h | 3 --- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index af6a32b6f3e7b..0ae2bb0e77f5d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -197,8 +197,6 @@ typedef int __bitwise rmap_t; void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, diff --git a/include/linux/swap.h b/include/linux/swap.h index 
f6dd6575b9054..3e1909087f6a9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -397,9 +397,6 @@ void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); -extern void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma); - /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, -- cgit v1.2.3 From 8ba2f844f050a82624ba3ad5146aa3c116f506f7 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 28 Dec 2023 09:45:46 +0000 Subject: mm/zswap: change per-cpu mutex and buffer to per-acomp_ctx First of all, we need to rename the acomp_ctx->dstmem field to buffer, since we are now using it for purposes other than compression. Then we change the per-cpu mutex and buffer to per-acomp_ctx, since they belong to the acomp_ctx and are necessary parts when used in the compress/decompress contexts. So we can remove the old per-cpu mutex and dstmem.
Link: https://lkml.kernel.org/r/20231213-zswap-dstmem-v5-5-9382162bbf05@bytedance.com Signed-off-by: Chengming Zhou Acked-by: Chris Li (Google) Reviewed-by: Nhat Pham Cc: Barry Song <21cnbao@gmail.com> Cc: Dan Streetman Cc: Johannes Weiner Cc: Seth Jennings Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/cpuhotplug.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index efc0c0b07efb4..c3e06e21766a3 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -124,7 +124,6 @@ enum cpuhp_state { CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, - CPUHP_MM_ZSWP_MEM_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, -- cgit v1.2.3 From 96c7b0b42239e7b8987b2664b458dc74e825f760 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:30 +0000 Subject: mm: return the folio from __read_swap_cache_async() Patch series "More swap folio conversions". These all seem like fairly straightforward conversions to me. A lot of compound_head() calls get removed. And page_swap_info(), which is nice. This patch (of 13): Move the folio->page conversion into the callers that actually want that. Most of the callers are happier with the folio anyway. If the page_allocated boolean is set, the folio allocated is of order-0, so it is safe to pass the page directly to swap_readpage(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231213215842.671461-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 08c240e16a01f..e88572d4c7202 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -34,7 +34,7 @@ void zswap_swapon(int type); void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); -void zswap_page_swapin(struct page *page); +void zswap_folio_swapin(struct folio *folio); #else struct zswap_lruvec_state {}; @@ -54,7 +54,7 @@ static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} -static inline void zswap_page_swapin(struct page *page) {} +static inline void zswap_folio_swapin(struct folio *folio) {} #endif #endif /* _LINUX_ZSWAP_H */ -- cgit v1.2.3 From 3a61e6f668120ee2c7840b91891c858d575d07e2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:38 +0000 Subject: mm: convert swap_page_sector() to swap_folio_sector() All callers have a folio, so pass it in. Saves a couple of calls to compound_head(). 
Link: https://lkml.kernel.org/r/20231213215842.671461-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 3e1909087f6a9..2d09e9b7ee700 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -493,7 +493,7 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); -sector_t swap_page_sector(struct page *page); +sector_t swap_folio_sector(struct folio *folio); static inline void put_swap_device(struct swap_info_struct *si) { -- cgit v1.2.3 From 69fe7d67cb0c6eeab3d4c9a3bf950f9d12af4719 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 13 Dec 2023 21:58:40 +0000 Subject: mm: remove page_swap_info() It's more efficient to get the swap_info_struct by calling swp_swap_info() directly. 
Link: https://lkml.kernel.org/r/20231213215842.671461-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2d09e9b7ee700..4db00ddad2616 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -487,8 +487,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -extern struct swap_info_struct *page_swap_info(struct page *); -extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); +struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); -- cgit v1.2.3 From f099c961f4998ad7107b1c6a7d6efb225e9a4614 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:32 +0000 Subject: fs: remove clean_page_buffers() Patch series "Clean up the writeback paths". Most of these patches verge on the trivial, converting filesystems that just use block_write_full_page() to use mpage_writepages(). But as we saw with Christoph's earlier patchset, there can be some "interesting" gotchas, and I clearly haven't tested the majority of filesystems I've touched here. Patches 3 & 4 get rid of a lot of stack usage on architectures with larger page sizes; 1024 bytes on 64-bit systems with 64KiB pages. It starts to open the door to larger folio sizes on all architectures, but it's certainly not enough yet. Patch 14 is kind of trivial, but it's nice to get that simplification in. This patch (of 14): This function has been unused since the removal of bdev_write_page(). 
Link: https://lkml.kernel.org/r/20231215200245.748418-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231215200245.748418-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5f23ee599889f..94f6161eb45eb 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -270,7 +270,6 @@ int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); -void clean_page_buffers(struct page *page); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); -- cgit v1.2.3 From 17bf23a981be9c6629198a76940c777eb5c8c521 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:44 +0000 Subject: fs: convert block_write_full_page to block_write_full_folio Convert the function to be compatible with writepage_t so that it can be passed to write_cache_pages() by blkdev. This removes a call to compound_head(). We can also remove the function export as both callers are built-in. Link: https://lkml.kernel.org/r/20231215200245.748418-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 94f6161eb45eb..396b2adf24bf1 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -252,8 +252,8 @@ void __bh_read_batch(int nr, struct buffer_head *bhs[], * address_spaces. 
*/ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc); +int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, + void *get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); -- cgit v1.2.3 From 14059f66a959c760467ea2041e165f412845bcb8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:02:45 +0000 Subject: fs: remove the bh_end_io argument from __block_write_full_folio All callers are passing end_buffer_async_write as this argument, so we can hardcode references to it within __block_write_full_folio(). That lets us make end_buffer_async_write() static. Link: https://lkml.kernel.org/r/20231215200245.748418-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 396b2adf24bf1..d78454a4dd1f0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,7 +205,6 @@ struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -void end_buffer_async_write(struct buffer_head *bh, int uptodate); /* Things to do with buffers at mapping->private_list */ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); @@ -255,8 +254,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length); int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void 
*get_block); int __block_write_full_folio(struct inode *inode, struct folio *folio, - get_block_t *get_block, struct writeback_control *wbc, - bh_end_io_t *handler); + get_block_t *get_block, struct writeback_control *wbc); int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, -- cgit v1.2.3 From 280ec6ccb6422aa4a04f9ac4216ddcf055acc95d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:45 +0100 Subject: kasan: rename kasan_slab_free_mempool to kasan_mempool_poison_object Patch series "kasan: save mempool stack traces". This series updates KASAN to save alloc and free stack traces for secondary-level allocators that cache and reuse allocations internally instead of giving them back to the underlying allocator (e.g. mempool). As a part of this change, introduce and document a set of KASAN hooks: bool kasan_mempool_poison_pages(struct page *page, unsigned int order); void kasan_mempool_unpoison_pages(struct page *page, unsigned int order); bool kasan_mempool_poison_object(void *ptr); void kasan_mempool_unpoison_object(void *ptr, size_t size); and use them in the mempool code. Besides mempool, skbuff and io_uring also cache allocations and already use KASAN hooks to poison those. Their code is updated to use the new mempool hooks. The new hooks save alloc and free stack traces (for normal kmalloc and slab objects; stack traces for large kmalloc objects and page_alloc are not supported by KASAN yet), improve the readability of the users' code, and also allow the users to prevent double-free and invalid-free bugs; see the patches for the details. This patch (of 21): Rename kasan_slab_free_mempool to kasan_mempool_poison_object. 
kasan_slab_free_mempool is a slightly confusing name: it is unclear whether this function poisons the object when it is freed into mempool or does something when the object is freed from mempool to the underlying allocator. The new name also aligns with other mempool-related KASAN hooks added in the following patches in this series. Link: https://lkml.kernel.org/r/cover.1703024586.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/c5618685abb7cdbf9fb4897f565e7759f601da84.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72cb693b075b7..6310435f528b3 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -172,11 +172,11 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } -void __kasan_slab_free_mempool(void *ptr, unsigned long ip); -static __always_inline void kasan_slab_free_mempool(void *ptr) +void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +static __always_inline void kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) - __kasan_slab_free_mempool(ptr, _RET_IP_); + __kasan_mempool_poison_object(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, @@ -256,7 +256,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init return false; } static inline void kasan_kfree_large(void *ptr) {} -static inline void kasan_slab_free_mempool(void *ptr) {} +static inline void kasan_mempool_poison_object(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { -- cgit v1.2.3 From 9b94fe91099cbf05606151ef05bea9632666f5d5 Mon 
Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:46 +0100 Subject: kasan: move kasan_mempool_poison_object Move kasan_mempool_poison_object after all slab-related KASAN hooks. This is a preparatory change for the following patches in this series. No functional changes. Link: https://lkml.kernel.org/r/23ea215409f43c13cdf9ecc454501a264c107d67.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6310435f528b3..0d1f925c136d9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -172,13 +172,6 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } -void __kasan_mempool_poison_object(void *ptr, unsigned long ip); -static __always_inline void kasan_mempool_poison_object(void *ptr) -{ - if (kasan_enabled()) - __kasan_mempool_poison_object(ptr, _RET_IP_); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init); static __always_inline void * __must_check kasan_slab_alloc( @@ -219,6 +212,13 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } +void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +static __always_inline void kasan_mempool_poison_object(void *ptr) +{ + if (kasan_enabled()) + __kasan_mempool_poison_object(ptr, _RET_IP_); +} + /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. 
@@ -256,7 +256,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init return false; } static inline void kasan_kfree_large(void *ptr) {} -static inline void kasan_mempool_poison_object(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags, bool init) { @@ -276,6 +275,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } +static inline void kasan_mempool_poison_object(void *ptr) {} static inline bool kasan_check_byte(const void *address) { return true; -- cgit v1.2.3 From 1bb843048d00050678c392dab87a15c8b756df6f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:47 +0100 Subject: kasan: document kasan_mempool_poison_object Add documentation comment for kasan_mempool_poison_object. Link: https://lkml.kernel.org/r/af33ba8cabfa1ad731fe23a3f874bfc8d3b7fed4.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0d1f925c136d9..bbf6e2fa4ffd9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -213,6 +213,24 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, } void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +/** + * kasan_mempool_poison_object - Check and poison a mempool slab allocation. + * @ptr: Pointer to the slab allocation. + * + * This function is intended for kernel subsystems that cache slab allocations + * to reuse them instead of freeing them back to the slab allocator (e.g. + * mempool). 
+ * + * This function poisons a slab allocation without initializing its memory and + * without putting it into the quarantine (for the Generic mode). + * + * This function also performs checks to detect double-free and invalid-free + * bugs and reports them. + * + * This function operates on all slab allocations including large kmalloc + * allocations (the ones returned by kmalloc_large() or by kmalloc() with the + * size > KMALLOC_MAX_SIZE). + */ static __always_inline void kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) -- cgit v1.2.3 From 2e7c954c11af96aa1e0566a706f22152ef91d759 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:48 +0100 Subject: kasan: add return value for kasan_mempool_poison_object Add a return value for kasan_mempool_poison_object that lets the caller know whether the allocation is affected by a double-free or an invalid-free bug. The caller can use this return value to stop operating on the object. Also introduce a check_page_allocation helper function to improve the code readability. Link: https://lkml.kernel.org/r/618af65273875fb9f56954285443279b15f1fcd9.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index bbf6e2fa4ffd9..33387e254caae 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -212,7 +212,7 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } -void __kasan_mempool_poison_object(void *ptr, unsigned long ip); +bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. 
* @ptr: Pointer to the slab allocation. @@ -225,16 +225,20 @@ void __kasan_mempool_poison_object(void *ptr, unsigned long ip); * without putting it into the quarantine (for the Generic mode). * * This function also performs checks to detect double-free and invalid-free - * bugs and reports them. + * bugs and reports them. The caller can use the return value of this function + * to find out if the allocation is buggy. * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the * size > KMALLOC_MAX_SIZE). + * + * Return: true if the allocation can be safely reused; false otherwise. */ -static __always_inline void kasan_mempool_poison_object(void *ptr) +static __always_inline bool kasan_mempool_poison_object(void *ptr) { if (kasan_enabled()) - __kasan_mempool_poison_object(ptr, _RET_IP_); + return __kasan_mempool_poison_object(ptr, _RET_IP_); + return true; } /* @@ -293,7 +297,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_mempool_poison_object(void *ptr) {} +static inline bool kasan_mempool_poison_object(void *ptr) +{ + return true; +} static inline bool kasan_check_byte(const void *address) { return true; -- cgit v1.2.3 From 1956832753735b1c399b86b2c66cb7c317dc9f31 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:49 +0100 Subject: kasan: introduce kasan_mempool_unpoison_object Introduce and document a kasan_mempool_unpoison_object hook. This hook serves as a replacement for the generic kasan_unpoison_range that the mempool code relies on right now. mempool will be updated to use the new hook in one of the following patches. For now, define the new hook to be identical to kasan_unpoison_range. One of the following patches will update it to add stack trace collection. 
Link: https://lkml.kernel.org/r/dae25f0e18ed8fd50efe509c5b71a0592de5c18d.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 33387e254caae..c5fe303bc1c2b 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -228,6 +228,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * bugs and reports them. The caller can use the return value of this function * to find out if the allocation is buggy. * + * Before the poisoned allocation can be reused, it must be unpoisoned via + * kasan_mempool_unpoison_object(). + * * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the * size > KMALLOC_MAX_SIZE). @@ -241,6 +244,32 @@ static __always_inline bool kasan_mempool_poison_object(void *ptr) return true; } +void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); +/** + * kasan_mempool_unpoison_object - Unpoison a mempool slab allocation. + * @ptr: Pointer to the slab allocation. + * @size: Size to be unpoisoned. + * + * This function is intended for kernel subsystems that cache slab allocations + * to reuse them instead of freeing them back to the slab allocator (e.g. + * mempool). + * + * This function unpoisons a slab allocation that was previously poisoned via + * kasan_mempool_poison_object() without initializing its memory. For the + * tag-based modes, this function does not assign a new tag to the allocation + * and instead restores the original tags based on the pointer value. 
+ * + * This function operates on all slab allocations including large kmalloc + * allocations (the ones returned by kmalloc_large() or by kmalloc() with the + * size > KMALLOC_MAX_SIZE). + */ +static __always_inline void kasan_mempool_unpoison_object(void *ptr, + size_t size) +{ + if (kasan_enabled()) + __kasan_mempool_unpoison_object(ptr, size, _RET_IP_); +} + /* * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for * the hardware tag-based mode that doesn't rely on compiler instrumentation. @@ -301,6 +330,8 @@ static inline bool kasan_mempool_poison_object(void *ptr) { return true; } +static inline void kasan_mempool_unpoison_object(void *ptr, size_t size) {} + static inline bool kasan_check_byte(const void *address) { return true; -- cgit v1.2.3 From f129c31039283df884913142b0f3797d64d3a9d6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:50 +0100 Subject: kasan: introduce kasan_mempool_poison_pages Introduce and document a kasan_mempool_poison_pages hook to be used by the mempool code instead of kasan_poison_pages. Compared to kasan_poison_pages, the new hook: 1. For the tag-based modes, skips checking and poisoning allocations that were not tagged due to sampling. 2. Checks for double-free and invalid-free bugs. In the future, kasan_poison_pages can also be updated to handle #2, but this is out-of-scope of this series. 
Link: https://lkml.kernel.org/r/88dc7340cce28249abf789f6e0c792c317df9ba5.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index c5fe303bc1c2b..de2a695ad34d4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -212,6 +212,29 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } +bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, + unsigned long ip); +/** + * kasan_mempool_poison_pages - Check and poison a mempool page allocation. + * @page: Pointer to the page allocation. + * @order: Order of the allocation. + * + * This function is intended for kernel subsystems that cache page allocations + * to reuse them instead of freeing them back to page_alloc (e.g. mempool). + * + * This function is similar to kasan_mempool_poison_object() but operates on + * page allocations. + * + * Return: true if the allocation can be safely reused; false otherwise. + */ +static __always_inline bool kasan_mempool_poison_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + return __kasan_mempool_poison_pages(page, order, _RET_IP_); + return true; +} + bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. 
@@ -326,6 +349,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } +static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int order) +{ + return true; +} static inline bool kasan_mempool_poison_object(void *ptr) { return true; -- cgit v1.2.3 From 9f41c59ae3163690868a32bd77e9e33c3bab555e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:51 +0100 Subject: kasan: introduce kasan_mempool_unpoison_pages Introduce and document a new kasan_mempool_unpoison_pages hook to be used by the mempool code instead of kasan_unpoison_pages. This hook is not functionally different from kasan_unpoison_pages, but using it improves the mempool code readability. Link: https://lkml.kernel.org/r/239bd9af6176f2cc59f5c25893eb36143184daff.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index de2a695ad34d4..f8ebde384bd75 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -225,6 +225,9 @@ bool __kasan_mempool_poison_pages(struct page *page, unsigned int order, * This function is similar to kasan_mempool_poison_object() but operates on * page allocations. * + * Before the poisoned allocation can be reused, it must be unpoisoned via + * kasan_mempool_unpoison_pages(). + * * Return: true if the allocation can be safely reused; false otherwise. 
*/ static __always_inline bool kasan_mempool_poison_pages(struct page *page, @@ -235,6 +238,27 @@ static __always_inline bool kasan_mempool_poison_pages(struct page *page, return true; } +void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order, + unsigned long ip); +/** + * kasan_mempool_unpoison_pages - Unpoison a mempool page allocation. + * @page: Pointer to the page allocation. + * @order: Order of the allocation. + * + * This function is intended for kernel subsystems that cache page allocations + * to reuse them instead of freeing them back to page_alloc (e.g. mempool). + * + * This function unpoisons a page allocation that was previously poisoned by + * kasan_mempool_poison_pages() without zeroing the allocation's memory. For + * the tag-based modes, this function assigns a new tag to the allocation. + */ +static __always_inline void kasan_mempool_unpoison_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + __kasan_mempool_unpoison_pages(page, order, _RET_IP_); +} + bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); /** * kasan_mempool_poison_object - Check and poison a mempool slab allocation. @@ -353,6 +377,7 @@ static inline bool kasan_mempool_poison_pages(struct page *page, unsigned int or { return true; } +static inline void kasan_mempool_unpoison_pages(struct page *page, unsigned int order) {} static inline bool kasan_mempool_poison_object(void *ptr) { return true; -- cgit v1.2.3 From b556a462eb8df6b6836c318d23f43409c40a7c7e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:53 +0100 Subject: kasan: save free stack traces for slab mempools Make kasan_mempool_poison_object save free stack traces for slab and kmalloc mempools when the object is freed into the mempool. Also simplify and rename ____kasan_slab_free to poison_slab_object and do a few other readability changes. 
Link: https://lkml.kernel.org/r/413a7c7c3344fb56809853339ffaabc9e4905e94.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f8ebde384bd75..e636a00e26bac 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -268,8 +268,9 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip); * to reuse them instead of freeing them back to the slab allocator (e.g. * mempool). * - * This function poisons a slab allocation without initializing its memory and - * without putting it into the quarantine (for the Generic mode). + * This function poisons a slab allocation and saves a free stack trace for it + * without initializing the allocation's memory and without putting it into the + * quarantine (for the Generic mode). * * This function also performs checks to detect double-free and invalid-free * bugs and reports them. The caller can use the return value of this function -- cgit v1.2.3 From 29d7355a9d05de9a6e38cc4d1146fb96c43853fb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:56 +0100 Subject: kasan: save alloc stack traces for mempool Update kasan_mempool_unpoison_object to properly poison the redzone and save alloc stack traces for kmalloc and slab pools. As a part of this change, split out and use an unpoison_slab_object helper function from __kasan_slab_alloc. 
[nathan@kernel.org: mark unpoison_slab_object() as static] Link: https://lkml.kernel.org/r/20231221180042.104694-1-andrey.konovalov@linux.dev Link: https://lkml.kernel.org/r/05ad235da8347cfe14d496d01b2aaf074b4f607c.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Nathan Chancellor Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e636a00e26bac..7392c5d89b920 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -303,9 +303,10 @@ void __kasan_mempool_unpoison_object(void *ptr, size_t size, unsigned long ip); * mempool). * * This function unpoisons a slab allocation that was previously poisoned via - * kasan_mempool_poison_object() without initializing its memory. For the - * tag-based modes, this function does not assign a new tag to the allocation - * and instead restores the original tags based on the pointer value. + * kasan_mempool_poison_object() and saves an alloc stack trace for it without + * initializing the allocation's memory. For the tag-based modes, this function + * does not assign a new tag to the allocation and instead restores the + * original tags based on the pointer value. 
* * This function operates on all slab allocations including large kmalloc * allocations (the ones returned by kmalloc_large() or by kmalloc() with the -- cgit v1.2.3 From 37dcc69ad17a008d2b720bdc39f070ef2a959430 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:28:59 +0100 Subject: mempool: introduce mempool_use_prealloc_only Introduce a new mempool_alloc_preallocated API that asks the mempool to only use the elements preallocated during the mempool's creation when allocating and to not attempt allocating new ones from the underlying allocator. This API is required to test the KASAN poisoning/unpoisoning functionality in KASAN tests, but it might be also useful on its own. Link: https://lkml.kernel.org/r/a14d809dbdfd04cc33bcacc632fee2abd6b83c00.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/mempool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4aae6c06c5f28..7be1e32e6d421 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -51,6 +51,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; +extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* -- cgit v1.2.3 From 1ce9a0523938f87dd8505233cc3445f8e2d8dcee Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 23:29:03 +0100 Subject: kasan: rename and document kasan_(un)poison_object_data Rename kasan_unpoison_object_data to kasan_unpoison_new_object and add a documentation comment. 
Do the same for kasan_poison_object_data. The new names and the comments should suggest the users that these hooks are intended for internal use by the slab allocator. The following patch will remove non-slab-internal uses of these hooks. No functional changes. [andreyknvl@google.com: update references to renamed functions in comments] Link: https://lkml.kernel.org/r/20231221180637.105098-1-andrey.konovalov@linux.dev Link: https://lkml.kernel.org/r/eab156ebbd635f9635ef67d1a4271f716994e628.1703024586.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Lobakin Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Breno Leitao Cc: Dmitry Vyukov Cc: Evgenii Stepanov Signed-off-by: Andrew Morton --- include/linux/kasan.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7392c5d89b920..d49e3d4c099ef 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -129,20 +129,39 @@ static __always_inline void kasan_poison_slab(struct slab *slab) __kasan_poison_slab(slab); } -void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); -static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, +void __kasan_unpoison_new_object(struct kmem_cache *cache, void *object); +/** + * kasan_unpoison_new_object - Temporarily unpoison a new slab object. + * @cache: Cache the object belong to. + * @object: Pointer to the object. + * + * This function is intended for the slab allocator's internal use. It + * temporarily unpoisons an object from a newly allocated slab without doing + * anything else. The object must later be repoisoned by + * kasan_poison_new_object(). 
+ */ +static __always_inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) - __kasan_unpoison_object_data(cache, object); + __kasan_unpoison_new_object(cache, object); } -void __kasan_poison_object_data(struct kmem_cache *cache, void *object); -static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, +void __kasan_poison_new_object(struct kmem_cache *cache, void *object); +/** + * kasan_unpoison_new_object - Repoison a new slab object. + * @cache: Cache the object belong to. + * @object: Pointer to the object. + * + * This function is intended for the slab allocator's internal use. It + * repoisons an object that was previously unpoisoned by + * kasan_unpoison_new_object() without doing anything else. + */ +static __always_inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) { if (kasan_enabled()) - __kasan_poison_object_data(cache, object); + __kasan_poison_new_object(cache, object); } void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, @@ -342,9 +361,9 @@ static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, return false; } static inline void kasan_poison_slab(struct slab *slab) {} -static inline void kasan_unpoison_object_data(struct kmem_cache *cache, +static inline void kasan_unpoison_new_object(struct kmem_cache *cache, void *object) {} -static inline void kasan_poison_object_data(struct kmem_cache *cache, +static inline void kasan_poison_new_object(struct kmem_cache *cache, void *object) {} static inline void *kasan_init_slab_obj(struct kmem_cache *cache, const void *object) -- cgit v1.2.3 From 91349f541e7daa6cce15e01e7ffe4fd63731ead9 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 19 Dec 2023 22:19:53 +0100 Subject: lib/stackdepot: fix comment in include/linux/stackdepot.h As stack traces can now be evicted from the stack depot, remove the comment saying that they are never removed. 
Link: https://lkml.kernel.org/r/0ebe712d91f8d302a8947d3c9e9123bc2b1b8440.1703020707.git.andreyknvl@google.com Fixes: 108be8def46e ("lib/stackdepot: allow users to evict stack traces") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Tetsuo Handa Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index a6796f1789138..adcbb8f236000 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -11,8 +11,6 @@ * SLUB_DEBUG needs 256 bytes per object for that). Since allocation and free * stack traces often repeat, using stack depot allows to save about 100x space. * - * Stack traces are never removed from the stack depot. - * * Author: Alexander Potapenko * Copyright (C) 2016 Google, Inc. * -- cgit v1.2.3 From 7fbb5e188248c50f737720825da1864ce42536d1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 19 Dec 2023 21:41:23 -0800 Subject: mm: remove VM_EXEC requirement for THP eligibility Commit e6be37b2e7bd ("mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled()") introduced the VM_EXEC requirement, which is not strictly needed. lld's default --rosegment option and GNU ld's -z separate-code option (default on Linux/x86 since binutils 2.31) create a read-only PT_LOAD segment without the PF_X flag, which should be eligible for THP. Certain architectures support medium and large code models, where .lrodata may be placed in a separate read-only PT_LOAD segment, which should be eligible for THP as well. 
Link: https://lkml.kernel.org/r/20231220054123.1266001-1-maskray@google.com Signed-off-by: Fangrui Song Acked-by: Yang Shi Cc: Miaohe Lin Cc: Song Liu Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fa7a38a30fc68..5adb86af35fc4 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -206,7 +206,6 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) inode = vma->vm_file->f_inode; return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && - (vma->vm_flags & VM_EXEC) && !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -- cgit v1.2.3 From 5ec8e8ea8b7783fab150cf86404fc38cb4db8800 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Fri, 13 Oct 2023 18:34:27 +0530 Subject: mm/sparsemem: fix race in accessing memory_section->usage The below race is observed on a PFN which falls into the device memory region with the system memory configuration where PFN's are such that [ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL]. Since normal zone start and end pfn contains the device memory PFN's as well, the compaction triggered will try on the device memory PFN's too though they end up in NOP(because pfn_to_online_page() returns NULL for ZONE_DEVICE memory sections). When from other core, the section mappings are being removed for the ZONE_DEVICE region, that the PFN in question belongs to, on which compaction is currently being operated is resulting into the kernel crash with CONFIG_SPARSEMEM_VMEMMAP enabled. The crash logs can be seen at [1]. compact_zone() memunmap_pages ------------- --------------- __pageblock_pfn_to_page ...... 
(a)pfn_valid(): valid_section()//return true (b)__remove_pages()-> sparse_remove_section()-> section_deactivate(): [Free the array ms->usage and set ms->usage = NULL] pfn_section_valid() [Access ms->usage which is NULL] NOTE: From the above it can be said that the race is reduced to between the pfn_valid()/pfn_section_valid() and the section deactivate with SPARSEMEM_VMEMMAP enabled. The commit b943f045a9af("mm/sparse: fix kernel crash with pfn_section_valid check") tried to address the same problem by clearing the SECTION_HAS_MEM_MAP with the expectation of valid_section() returns false thus ms->usage is not accessed. Fix this issue by the below steps: a) Clear SECTION_HAS_MEM_MAP before freeing the ->usage. b) RCU protected read side critical section will either return NULL when SECTION_HAS_MEM_MAP is cleared or can successfully access ->usage. c) Free the ->usage with kfree_rcu() and set ms->usage = NULL. No attempt will be made to access ->usage after this as the SECTION_HAS_MEM_MAP is cleared thus valid_section() return false. Thanks to David/Pavan for their inputs on this patch. [1] https://lore.kernel.org/linux-mm/994410bb-89aa-d987-1f50-f514903c55aa@quicinc.com/ On Snapdragon SoC, with the mentioned memory configuration of PFN's as [ZONE_NORMAL ZONE_DEVICE ZONE_NORMAL], we are able to see bunch of issues daily while testing on a device farm. For this particular issue below is the log. Though the below log is not directly pointing to the pfn_section_valid(){ ms->usage;}, when we loaded this dump on T32 lauterbach tool, it is pointing. 
[ 540.578056] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 540.578068] Mem abort info: [ 540.578070] ESR = 0x0000000096000005 [ 540.578073] EC = 0x25: DABT (current EL), IL = 32 bits [ 540.578077] SET = 0, FnV = 0 [ 540.578080] EA = 0, S1PTW = 0 [ 540.578082] FSC = 0x05: level 1 translation fault [ 540.578085] Data abort info: [ 540.578086] ISV = 0, ISS = 0x00000005 [ 540.578088] CM = 0, WnR = 0 [ 540.579431] pstate: 82400005 (Nzcv daif +PAN -UAO +TCO -DIT -SSBSBTYPE=--) [ 540.579436] pc : __pageblock_pfn_to_page+0x6c/0x14c [ 540.579454] lr : compact_zone+0x994/0x1058 [ 540.579460] sp : ffffffc03579b510 [ 540.579463] x29: ffffffc03579b510 x28: 0000000000235800 x27:000000000000000c [ 540.579470] x26: 0000000000235c00 x25: 0000000000000068 x24:ffffffc03579b640 [ 540.579477] x23: 0000000000000001 x22: ffffffc03579b660 x21:0000000000000000 [ 540.579483] x20: 0000000000235bff x19: ffffffdebf7e3940 x18:ffffffdebf66d140 [ 540.579489] x17: 00000000739ba063 x16: 00000000739ba063 x15:00000000009f4bff [ 540.579495] x14: 0000008000000000 x13: 0000000000000000 x12:0000000000000001 [ 540.579501] x11: 0000000000000000 x10: 0000000000000000 x9 :ffffff897d2cd440 [ 540.579507] x8 : 0000000000000000 x7 : 0000000000000000 x6 :ffffffc03579b5b4 [ 540.579512] x5 : 0000000000027f25 x4 : ffffffc03579b5b8 x3 :0000000000000001 [ 540.579518] x2 : ffffffdebf7e3940 x1 : 0000000000235c00 x0 :0000000000235800 [ 540.579524] Call trace: [ 540.579527] __pageblock_pfn_to_page+0x6c/0x14c [ 540.579533] compact_zone+0x994/0x1058 [ 540.579536] try_to_compact_pages+0x128/0x378 [ 540.579540] __alloc_pages_direct_compact+0x80/0x2b0 [ 540.579544] __alloc_pages_slowpath+0x5c0/0xe10 [ 540.579547] __alloc_pages+0x250/0x2d0 [ 540.579550] __iommu_dma_alloc_noncontiguous+0x13c/0x3fc [ 540.579561] iommu_dma_alloc+0xa0/0x320 [ 540.579565] dma_alloc_attrs+0xd4/0x108 [quic_charante@quicinc.com: use kfree_rcu() in place of synchronize_rcu(), per David] Link: 
https://lkml.kernel.org/r/1698403778-20938-1-git-send-email-quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1697202267-23600-1-git-send-email-quic_charante@quicinc.com Fixes: f46edbd1b151 ("mm/sparsemem: add helpers track active portions of a section at boot") Signed-off-by: Charan Teja Kalla Cc: Aneesh Kumar K.V Cc: Dan Williams Cc: David Hildenbrand Cc: Mel Gorman Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ec73582e7d278..2efd3be484fdd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1799,6 +1799,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { + struct rcu_head rcu; #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif @@ -1992,7 +1993,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); - return test_bit(idx, ms->usage->subsection_map); + return test_bit(idx, READ_ONCE(ms->usage)->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) @@ -2016,6 +2017,7 @@ static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; + int ret; /* * Ensure the upper PAGE_SHIFT bits are clear in the @@ -2029,13 +2031,19 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __pfn_to_section(pfn); - if (!valid_section(ms)) + rcu_read_lock(); + if (!valid_section(ms)) { + rcu_read_unlock(); return 0; + } /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. 
*/ - return early_section(ms) || pfn_section_valid(ms, pfn); + ret = early_section(ms) || pfn_section_valid(ms, pfn); + rcu_read_unlock(); + + return ret; } #endif -- cgit v1.2.3 From 5cb6674b694b84803cbee8bfccaa2bfdfeb6eae4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 21 Dec 2023 21:04:44 +0100 Subject: mm, kasan: use KASAN_TAG_KERNEL instead of 0xff Use the KASAN_TAG_KERNEL macro instead of open-coding 0xff in the mm code. This macro is provided by include/linux/kasan-tags.h, which does not include any other headers, so it's safe to include it into mm.h without causing circular include dependencies. Link: https://lkml.kernel.org/r/71db9087b0aebb6c4dccbc609cc0cd50621533c7.1703188911.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/kasan.h | 1 + include/linux/mm.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d49e3d4c099ef..dbb06d789e74e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index b72bf25a45cfd..2563ffdb51bc7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1815,7 +1815,7 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) static inline u8 page_kasan_tag(const struct page *page) { - u8 tag = 0xff; + u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; @@ -1844,7 +1844,7 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) - page_kasan_tag_set(page, 0xff); + page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ -- cgit v1.2.3 From
9d5fafd5d882446999366f673ab06edba453f862 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:25 +0100 Subject: mm/rmap: rename hugepage_add* to hugetlb_add* Patch series "mm/rmap: interface overhaul", v2. This series overhauls the rmap interface, to get rid of the "bool compound" / RMAP_COMPOUND parameter with the goal of making the interface less error prone, more future proof, and more natural to extend to "batching". Also, this converts the interface to always consume folio+subpage, which speeds up operations on large folios. Further, this series adds PTE-batching variants for 4 rmap functions, whereby only folio_add_anon_rmap_ptes() is used for batching in this series when PTE-remapping a PMD-mapped THP. folio_remove_rmap_ptes(), folio_try_dup_anon_rmap_ptes() and folio_dup_file_rmap_ptes() will soon come in handy[1,2]. This series performs a lot of folio conversion along the way. Most of the added LOC in the diff are only due to documentation. As we're moving to a pte/pmd interface where we clearly express the mapping granularity we are dealing with, we first get the remainder of hugetlb out of the way, as it is special and expected to remain special: it treats everything as a "single logical PTE" and only currently allows entire mappings. Even if we'd ever support partial mappings, I strongly assume the interface and implementation will still differ heavily: hopefully we can avoid working on subpages/subpage mapcounts completely and only add a "count" parameter for them to enable batching.
New (extended) hugetlb interface that operates on entire folio: * hugetlb_add_new_anon_rmap() -> Already existed * hugetlb_add_anon_rmap() -> Already existed * hugetlb_try_dup_anon_rmap() * hugetlb_try_share_anon_rmap() * hugetlb_add_file_rmap() * hugetlb_remove_rmap() New "ordinary" interface for small folios / THP:: * folio_add_new_anon_rmap() -> Already existed * folio_add_anon_rmap_[pte|ptes|pmd]() * folio_try_dup_anon_rmap_[pte|ptes|pmd]() * folio_try_share_anon_rmap_[pte|pmd]() * folio_add_file_rmap_[pte|ptes|pmd]() * folio_dup_file_rmap_[pte|ptes|pmd]() * folio_remove_rmap_[pte|ptes|pmd]() folio_add_new_anon_rmap() will always map at the largest granularity possible (currently, a single PMD to cover a PMD-sized THP). Could be extended if ever required. In the future, we might want "_pud" variants and eventually "_pmds" variants for batching. I ran some simple microbenchmarks on an Intel(R) Xeon(R) Silver 4210R: measuring munmap(), fork(), cow, MADV_DONTNEED on each PTE ... and PTE remapping PMD-mapped THPs on 1 GiB of memory. For small folios, there is barely a change (< 1% improvement for me). For PTE-mapped THP: * PTE-remapping a PMD-mapped THP is more than 10% faster. * fork() is more than 4% faster. * MADV_DONTNEED is 2% faster * COW when writing only a single byte on a COW-shared PTE is 1% faster * munmap() barely changes (< 1%). [1] https://lkml.kernel.org/r/20230810103332.3062143-1-ryan.roberts@arm.com [2] https://lkml.kernel.org/r/20231204105440.61448-1-ryan.roberts@arm.com This patch (of 40): Let's just call it "hugetlb_". Yes, it's all already inconsistent and confusing because we have a lot of "hugepage_" functions for legacy reasons. But "hugetlb" cannot possibly be confused with transparent huge pages, and it matches "hugetlb.c" and "folio_test_hugetlb()". So let's minimize confusion in rmap code. 
Link: https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com Link: https://lkml.kernel.org/r/20231220224504.646757-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0ae2bb0e77f5d..36096ba69bdcd 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -206,9 +206,9 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); -void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *, +void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, +void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); static inline void __page_dup_rmap(struct page *page, bool compound) -- cgit v1.2.3 From e135826b2da0cf25305086dc9ac1e91718a148e1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:26 +0100 Subject: mm/rmap: introduce and use hugetlb_remove_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap. implementation/interface. Let's introduce and use hugetlb_remove_rmap() and remove the hugetlb code from page_remove_rmap(). This effectively removes one check on the small-folio path as well. Add sanity checks that we end up with the right folios in the right functions. 
Note: all possible candidates that need care are page_remove_rmap() that pass compound=true. Link: https://lkml.kernel.org/r/20231220224504.646757-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 36096ba69bdcd..64ae6c4d72720 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -211,6 +211,13 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); +static inline void hugetlb_remove_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + + atomic_dec(&folio->_entire_mapcount); +} + static inline void __page_dup_rmap(struct page *page, bool compound) { if (compound) { -- cgit v1.2.3 From 44887f39945519fa8405133b1acd098fda9c9746 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:27 +0100 Subject: mm/rmap: introduce and use hugetlb_add_file_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap implementation/interface. Right now we're using page_dup_file_rmap() in some cases where "ordinary" rmap code would have used page_add_file_rmap(). So let's introduce and use hugetlb_add_file_rmap() instead. We won't be adding a "hugetlb_dup_file_rmap()" function for the fork() case, as it would be doing the same: "dup" is just an optimization for "add". What remains is a single page_dup_file_rmap() call in fork() code.
Add sanity checks that we end up with the right folios in the right functions. Link: https://lkml.kernel.org/r/20231220224504.646757-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 64ae6c4d72720..56900a16f41a6 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -211,6 +211,14 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); +static inline void hugetlb_add_file_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); + + atomic_inc(&folio->_entire_mapcount); +} + static inline void hugetlb_remove_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); -- cgit v1.2.3 From ebe2e35ec0f256372c158a18de459fb60070b313 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:28 +0100 Subject: mm/rmap: introduce and use hugetlb_try_dup_anon_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap. implementation/interface. So let's introduce and use hugetlb_try_dup_anon_rmap() to make all hugetlb handling use dedicated hugetlb_* rmap functions. Add sanity checks that we end up with the right folios in the right functions. Note that is_device_private_page() does not apply to hugetlb. 
Link: https://lkml.kernel.org/r/20231220224504.646757-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++++++++--- include/linux/rmap.h | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2563ffdb51bc7..75bba61028256 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1964,15 +1964,21 @@ static inline bool page_maybe_dma_pinned(struct page *page) * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) +static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, + struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; - return page_maybe_dma_pinned(page); + return folio_maybe_dma_pinned(folio); +} + +static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, + struct page *page) +{ + return folio_needs_cow_for_dma(vma, page_folio(page)); } /** diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 56900a16f41a6..5f26752de945c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -211,6 +211,22 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); +/* See page_try_dup_anon_rmap() */ +static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, + struct vm_area_struct *vma) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + + if (PageAnonExclusive(&folio->page)) { + if (unlikely(folio_needs_cow_for_dma(vma, 
folio))) + return -EBUSY; + ClearPageAnonExclusive(&folio->page); + } + atomic_inc(&folio->_entire_mapcount); + return 0; +} + static inline void hugetlb_add_file_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -228,6 +244,8 @@ static inline void hugetlb_remove_rmap(struct folio *folio) static inline void __page_dup_rmap(struct page *page, bool compound) { + VM_WARN_ON(folio_test_hugetlb(page_folio(page))); + if (compound) { struct folio *folio = (struct folio *)page; -- cgit v1.2.3 From 0c2ec32bf0b2f0d7ccb98c53ee5d255d68e73595 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:29 +0100 Subject: mm/rmap: introduce and use hugetlb_try_share_anon_rmap() hugetlb rmap handling differs quite a lot from "ordinary" rmap code. For example, hugetlb currently only supports entire mappings, and treats any mapping as mapped using a single "logical PTE". Let's move it out of the way so we can overhaul our "ordinary" rmap implementation/interface. So let's introduce and use hugetlb_try_share_anon_rmap() to make all hugetlb handling use dedicated hugetlb_* rmap functions. Add sanity checks that we end up with the right folios in the right functions. Note that try_to_unmap_one() does not need care. Easy to spot because among all that nasty hugetlb special-casing in that function, we're not using set_huge_pte_at() on the anon path -- well, and that code assumes that we would want to swapout.
Link: https://lkml.kernel.org/r/20231220224504.646757-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5f26752de945c..d6fefa0f04105 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -227,6 +227,30 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, return 0; } +/* See page_try_share_anon_rmap() */ +static inline int hugetlb_try_share_anon_rmap(struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio); + + /* Paired with the memory barrier in try_grab_folio(). */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb(); + + if (unlikely(folio_maybe_dma_pinned(folio))) + return -EBUSY; + ClearPageAnonExclusive(&folio->page); + + /* + * This is conceptually a smp_wmb() paired with the smp_rmb() in + * gup_must_unshare(). + */ + if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) + smp_mb__after_atomic(); + return 0; +} + static inline void hugetlb_add_file_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -331,6 +355,7 @@ dup: */ static inline int page_try_share_anon_rmap(struct page *page) { + VM_WARN_ON(folio_test_hugetlb(page_folio(page))); VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); /* device private pages cannot get pinned via GUP. 
*/ -- cgit v1.2.3 From 68f0320824fa59c5429cbc811e6c46e7a30ea32c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:31 +0100 Subject: mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]() Let's get rid of the compound parameter and instead define explicitly which mappings we're adding. That is more future proof, easier to read and harder to mess up. Use an enum to express the granularity internally. Make the compiler always special-case on the granularity by using __always_inline. Replace the "compound" check by a switch-case that will be removed by the compiler completely. Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the folio_test_pmd_mappable() check by a config check in the caller and sanity checks. Convert the single user of folio_add_file_rmap_range(). While at it, consistently use "int" instead of "unsigned int" in rmap code when dealing with mapcounts and the number of pages. This function design can later easily be extended to PUDs and to batch PMDs. Note that for now we don't support anything bigger than PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks will catch if that ever changes. Next up is removing page_remove_rmap() along with its "compound" parameter and similarly converting all other rmap functions.
Link: https://lkml.kernel.org/r/20231220224504.646757-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d6fefa0f04105..3d86a76b28368 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -191,6 +191,44 @@ typedef int __bitwise rmap_t; */ #define RMAP_COMPOUND ((__force rmap_t)BIT(1)) +/* + * Internally, we're using an enum to specify the granularity. We make the + * compiler emit specialized code for each granularity. + */ +enum rmap_level { + RMAP_LEVEL_PTE = 0, + RMAP_LEVEL_PMD, +}; + +static inline void __folio_rmap_sanity_checks(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + /* hugetlb folios are handled separately. */ + VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + VM_WARN_ON_FOLIO(folio_test_large(folio) && + !folio_test_large_rmappable(folio), folio); + + VM_WARN_ON_ONCE(nr_pages <= 0); + VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); + VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); + + switch (level) { + case RMAP_LEVEL_PTE: + break; + case RMAP_LEVEL_PMD: + /* + * We don't support folios larger than a single PMD yet. So + * when RMAP_LEVEL_PMD is set, we assume that we are creating + * a single "entire" mapping of the folio. 
+ */ + VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); + VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); + break; + default: + VM_WARN_ON_ONCE(true); + } +} + /* * rmap interfaces called when adding or removing pte of page */ @@ -201,8 +239,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); -void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, - struct vm_area_struct *, bool compound); +void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_add_file_rmap_pte(folio, page, vma) \ + folio_add_file_rmap_ptes(folio, page, 1, vma) +void folio_add_file_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); -- cgit v1.2.3 From be6e57cfabe99a5d3b3869103c4ea0ed4a9692d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:36 +0100 Subject: mm/rmap: remove page_add_file_rmap() All users are gone, let's remove it. 
Link: https://lkml.kernel.org/r/20231220224504.646757-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Reviewed-by: Ryan Roberts Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d86a76b28368..6a4db6933e7df 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -237,8 +237,6 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); -void page_add_file_rmap(struct page *, struct vm_area_struct *, - bool compound); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_add_file_rmap_pte(folio, page, vma) \ -- cgit v1.2.3 From 8bd5130070fbf2247a97c5361427a810522ac98a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:38 +0100 Subject: mm/rmap: introduce folio_add_anon_rmap_[pte|ptes|pmd]() Let's mimic what we did with folio_add_file_rmap_*() so we can similarly replace page_add_anon_rmap() next. Make the compiler always special-case on the granularity by using __always_inline. For the PageAnonExclusive sanity checks, when adding a PMD mapping, we're now also checking each individual subpage covered by that PMD, instead of only the head page. Note that the new functions ignore the RMAP_COMPOUND flag, which we will remove as soon as page_add_anon_rmap() is gone. 
Link: https://lkml.kernel.org/r/20231220224504.646757-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Yin Fengwei Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/rmap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6a4db6933e7df..b5da3d86200e4 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -233,6 +233,12 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, * rmap interfaces called when adding or removing pte of page */ void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); +void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *, unsigned long address, rmap_t flags); +#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \ + folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) +void folio_add_anon_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, -- cgit v1.2.3 From 84f0169e6c8a613012722e0d63302f9da4a72099 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:45 +0100 Subject: mm/rmap: remove page_add_anon_rmap() All users are gone, remove it and all traces. 
Link: https://lkml.kernel.org/r/20231220224504.646757-22-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b5da3d86200e4..fe7b5a8b0e75b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -239,8 +239,6 @@ void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags) void folio_add_anon_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); -void page_add_anon_rmap(struct page *, struct vm_area_struct *, - unsigned long address, rmap_t flags); void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, -- cgit v1.2.3 From 0cae959e3abf19ba62805f6e6a8b42b6cd9ed3e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:46 +0100 Subject: mm/rmap: remove RMAP_COMPOUND No longer used, let's remove it and clarify RMAP_NONE/RMAP_EXCLUSIVE a bit. 
Link: https://lkml.kernel.org/r/20231220224504.646757-23-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fe7b5a8b0e75b..bf6cb79aa7a0a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -177,20 +177,14 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio); typedef int __bitwise rmap_t; /* - * No special request: if the page is a subpage of a compound page, it is - * mapped via a PTE. The mapped (sub)page is possibly shared between processes. + * No special request: A mapped anonymous (sub)page is possibly shared between + * processes. */ #define RMAP_NONE ((__force rmap_t)0) -/* The (sub)page is exclusive to a single process. */ +/* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) -/* - * The compound page is not mapped via PTEs, but instead via a single PMD and - * should be accounted accordingly. - */ -#define RMAP_COMPOUND ((__force rmap_t)BIT(1)) - /* * Internally, we're using an enum to specify the granularity. We make the * compiler emit specialized code for each granularity. -- cgit v1.2.3 From b06dc281aa9901076898d4d0a7bde588f11bc204 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:47 +0100 Subject: mm/rmap: introduce folio_remove_rmap_[pte|ptes|pmd]() Let's mimic what we did with folio_add_file_rmap_*() and folio_add_anon_rmap_*() so we can similarly replace page_remove_rmap() next. Make the compiler always special-case on the granularity by using __always_inline. We're adding folio_remove_rmap_ptes() handling right away, as we want to use that soon for batching rmap operations when unmapping PTE-mapped large folios. 
Link: https://lkml.kernel.org/r/20231220224504.646757-24-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bf6cb79aa7a0a..57e045093f047 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -243,6 +243,12 @@ void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); +void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, + struct vm_area_struct *); +#define folio_remove_rmap_pte(folio, page, vma) \ + folio_remove_rmap_ptes(folio, page, 1, vma) +void folio_remove_rmap_pmd(struct folio *, struct page *, + struct vm_area_struct *); void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); -- cgit v1.2.3 From 4d8f7418e8ba36036c8486d92d9591c368ab9b85 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:56 +0100 Subject: mm/rmap: remove page_remove_rmap() All callers are gone, let's remove it and some leftover traces. 
Link: https://lkml.kernel.org/r/20231220224504.646757-33-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 57e045093f047..fef369e37039a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -241,8 +241,6 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages, folio_add_file_rmap_ptes(folio, page, 1, vma) void folio_add_file_rmap_pmd(struct folio *, struct page *, struct vm_area_struct *); -void page_remove_rmap(struct page *, struct vm_area_struct *, - bool compound); void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages, struct vm_area_struct *); #define folio_remove_rmap_pte(folio, page, vma) \ @@ -389,7 +387,7 @@ dup: * * This is similar to page_try_dup_anon_rmap(), however, not used during fork() * to duplicate a mapping, but instead to prepare for KSM or temporarily - * unmapping a page (swap, migration) via page_remove_rmap(). + * unmapping a page (swap, migration) via folio_remove_rmap_*(). * * Marking the page shared can only fail if the page may be pinned; device * private pages cannot get pinned and consequently this function cannot fail. -- cgit v1.2.3 From d8ef5e311d7bfde54b60ab45026f206eff31b2d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:57 +0100 Subject: mm/rmap: convert page_dup_file_rmap() to folio_dup_file_rmap_[pte|ptes|pmd]() Let's convert page_dup_file_rmap() like the other rmap functions. As there is only a single caller, convert that single caller right away and remove page_dup_file_rmap(). Add folio_dup_file_rmap_ptes() right away, we want to perform rmap batching during fork() soon.
Link: https://lkml.kernel.org/r/20231220224504.646757-34-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 59 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fef369e37039a..7607f862e795d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -308,6 +308,60 @@ static inline void hugetlb_remove_rmap(struct folio *folio) atomic_dec(&folio->_entire_mapcount); } +static __always_inline void __folio_dup_file_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) +{ + __folio_rmap_sanity_checks(folio, page, nr_pages, level); + + switch (level) { + case RMAP_LEVEL_PTE: + do { + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: + atomic_inc(&folio->_entire_mapcount); + break; + } +} + +/** + * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated + * + * The page range of the folio is defined by [page, page + nr_pages) + * + * The caller needs to hold the page table lock. 
+ */ +static inline void folio_dup_file_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages) +{ + __folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE); +} +#define folio_dup_file_rmap_pte(folio, page) \ + folio_dup_file_rmap_ptes(folio, page, 1) + +/** + * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock. + */ +static inline void folio_dup_file_rmap_pmd(struct folio *folio, + struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PTE); +#else + WARN_ON_ONCE(true); +#endif +} + static inline void __page_dup_rmap(struct page *page, bool compound) { VM_WARN_ON(folio_test_hugetlb(page_folio(page))); @@ -322,11 +376,6 @@ static inline void __page_dup_rmap(struct page *page, bool compound) } } -static inline void page_dup_file_rmap(struct page *page, bool compound) -{ - __page_dup_rmap(page, compound); -} - /** * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped * anonymous page -- cgit v1.2.3 From 61d90309b7156d54c5d358cb5d8bf55b33d233d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:44:58 +0100 Subject: mm/rmap: introduce folio_try_dup_anon_rmap_[pte|ptes|pmd]() The last users of page_needs_cow_for_dma() and __page_dup_rmap() are gone, remove them. Add folio_try_dup_anon_rmap_ptes() right away, we want to perform rmap batching during fork() soon.
Link: https://lkml.kernel.org/r/20231220224504.646757-35-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 --- include/linux/rmap.h | 150 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 106 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 75bba61028256..896c0079f64f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1975,12 +1975,6 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, return folio_maybe_dma_pinned(folio); } -static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, - struct page *page) -{ - return folio_needs_cow_for_dma(vma, page_folio(page)); -} - /** * is_zero_page - Query if a page is a zero page * @page: The page to query diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7607f862e795d..850aa74b6724c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -362,68 +362,130 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, #endif } -static inline void __page_dup_rmap(struct page *page, bool compound) +static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *src_vma, + enum rmap_level level) { - VM_WARN_ON(folio_test_hugetlb(page_folio(page))); + bool maybe_pinned; + int i; + + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); - if (compound) { - struct folio *folio = (struct folio *)page; + /* + * If this folio may have been pinned by the parent process, + * don't allow to duplicate the mappings but instead require to e.g., + * copy the subpage immediately for the child so that we'll always + * guarantee the pinned folio won't be randomly replaced in the + * future on 
write faults. + */ + maybe_pinned = likely(!folio_is_device_private(folio)) && + unlikely(folio_needs_cow_for_dma(src_vma, folio)); - VM_BUG_ON_PAGE(compound && !PageHead(page), page); + /* + * No need to check+clear for already shared PTEs/PMDs of the + * folio. But if any page is PageAnonExclusive, we must fallback to + * copying if the folio maybe pinned. + */ + switch (level) { + case RMAP_LEVEL_PTE: + if (unlikely(maybe_pinned)) { + for (i = 0; i < nr_pages; i++) + if (PageAnonExclusive(page + i)) + return -EBUSY; + } + do { + if (PageAnonExclusive(page)) + ClearPageAnonExclusive(page); + atomic_inc(&page->_mapcount); + } while (page++, --nr_pages > 0); + break; + case RMAP_LEVEL_PMD: + if (PageAnonExclusive(page)) { + if (unlikely(maybe_pinned)) + return -EBUSY; + ClearPageAnonExclusive(page); + } atomic_inc(&folio->_entire_mapcount); - } else { - atomic_inc(&page->_mapcount); + break; } + return 0; } /** - * page_try_dup_anon_rmap - try duplicating a mapping of an already mapped - * anonymous page - * @page: the page to duplicate the mapping for - * @compound: the page is mapped as compound or as a small page - * @vma: the source vma + * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range + * of a folio + * @folio: The folio to duplicate the mappings of + * @page: The first page to duplicate the mappings of + * @nr_pages: The number of pages of which the mapping will be duplicated + * @src_vma: The vm area from which the mappings are duplicated * - * The caller needs to hold the PT lock and the vma->vma_mm->write_protect_seq. + * The page range of the folio is defined by [page, page + nr_pages) * - * Duplicating the mapping can only fail if the page may be pinned; device - * private pages cannot get pinned and consequently this function cannot fail. + * The caller needs to hold the page table lock and the + * vma->vma_mm->write_protect_seq. 
+ * + * Duplicating the mappings can only fail if the folio may be pinned; device + * private folios cannot get pinned and consequently this function cannot fail + * for them. + * + * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in + * the parent and the child. They must *not* be writable after this call + * succeeded. + * + * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise. + */ +static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, + struct page *page, int nr_pages, struct vm_area_struct *src_vma) +{ + return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma, + RMAP_LEVEL_PTE); +} +#define folio_try_dup_anon_rmap_pte(folio, page, vma) \ + folio_try_dup_anon_rmap_ptes(folio, page, 1, vma) + +/** + * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range + * of a folio + * @folio: The folio to duplicate the mapping of + * @page: The first page to duplicate the mapping of + * @src_vma: The vm area from which the mapping is duplicated + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * - * If duplicating the mapping succeeds, the page has to be mapped R/O into - * the parent and the child. It must *not* get mapped writable after this call. + * The caller needs to hold the page table lock and the + * vma->vma_mm->write_protect_seq. + * + * Duplicating the mapping can only fail if the folio may be pinned; device + * private folios cannot get pinned and consequently this function cannot fail + * for them. + * + * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in + * the parent and the child. They must *not* be writable after this call + * succeeded. * * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise. 
*/ +static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, + struct page *page, struct vm_area_struct *src_vma) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); + return -EBUSY; +#endif +} + static inline int page_try_dup_anon_rmap(struct page *page, bool compound, struct vm_area_struct *vma) { - VM_BUG_ON_PAGE(!PageAnon(page), page); - - /* - * No need to check+clear for already shared pages, including KSM - * pages. - */ - if (!PageAnonExclusive(page)) - goto dup; - - /* - * If this page may have been pinned by the parent process, - * don't allow to duplicate the mapping but instead require to e.g., - * copy the page immediately for the child so that we'll always - * guarantee the pinned page won't be randomly replaced in the - * future on write faults. - */ - if (likely(!is_device_private_page(page)) && - unlikely(page_needs_cow_for_dma(vma, page))) - return -EBUSY; + struct folio *folio = page_folio(page); - ClearPageAnonExclusive(page); - /* - * It's okay to share the anon page between both processes, mapping - * the page R/O into both processes. - */ -dup: - __page_dup_rmap(page, compound); - return 0; + if (likely(!compound)) + return folio_try_dup_anon_rmap_pte(folio, page, vma); + return folio_try_dup_anon_rmap_pmd(folio, page, vma); } /** -- cgit v1.2.3 From a13d096471ec0ac5c6fc90fbcd57e8430024046a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:45:01 +0100 Subject: mm/rmap: remove page_try_dup_anon_rmap() All users are gone, remove page_try_dup_anon_rmap() and any remaining traces. 
Link: https://lkml.kernel.org/r/20231220224504.646757-38-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 850aa74b6724c..0ad2ea2734e4a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -253,7 +253,7 @@ void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *, void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); -/* See page_try_dup_anon_rmap() */ +/* See folio_try_dup_anon_rmap_*() */ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { @@ -478,16 +478,6 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, #endif } -static inline int page_try_dup_anon_rmap(struct page *page, bool compound, - struct vm_area_struct *vma) -{ - struct folio *folio = page_folio(page); - - if (likely(!compound)) - return folio_try_dup_anon_rmap_pte(folio, page, vma); - return folio_try_dup_anon_rmap_pmd(folio, page, vma); -} - /** * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly * shared to prepare for KSM or temporary unmapping @@ -496,8 +486,8 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * The caller needs to hold the PT lock and has to have the page table entry * cleared/invalidated. * - * This is similar to page_try_dup_anon_rmap(), however, not used during fork() - * to duplicate a mapping, but instead to prepare for KSM or temporarily + * This is similar to folio_try_dup_anon_rmap_*(), however, not used during + * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily * unmapping a page (swap, migration) via folio_remove_rmap_*(). 
* * Marking the page shared can only fail if the page may be pinned; device -- cgit v1.2.3 From e3b4b1374f87c71e9309efc6149f113cdd17af72 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 20 Dec 2023 23:45:02 +0100 Subject: mm: convert page_try_share_anon_rmap() to folio_try_share_anon_rmap_[pte|pmd]() Let's convert it like we converted all the other rmap functions. Don't introduce folio_try_share_anon_rmap_ptes() for now, as we don't have a user that wants rmap batching in sight. Pretty easy to add later. All users are easy to convert -- only ksm.c doesn't use folios yet but that is left for future work -- so let's just do it in a single shot. While at it, turn the BUG_ON into a WARN_ON_ONCE. Note that page_try_share_anon_rmap() so far didn't care about pte/pmd mappings (no compound parameter). We're changing that so we can perform better sanity checks and make the code actually more readable/consistent. For example, __folio_rmap_sanity_checks() will make sure that a PMD range actually falls completely into the folio. 
Link: https://lkml.kernel.org/r/20231220224504.646757-39-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Muchun Song Cc: Peter Xu Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 96 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0ad2ea2734e4a..fd6fe16fa3583 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -269,7 +269,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio, return 0; } -/* See page_try_share_anon_rmap() */ +/* See folio_try_share_anon_rmap_*() */ static inline int hugetlb_try_share_anon_rmap(struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); @@ -478,31 +478,15 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, #endif } -/** - * page_try_share_anon_rmap - try marking an exclusive anonymous page possibly - * shared to prepare for KSM or temporary unmapping - * @page: the exclusive anonymous page to try marking possibly shared - * - * The caller needs to hold the PT lock and has to have the page table entry - * cleared/invalidated. - * - * This is similar to folio_try_dup_anon_rmap_*(), however, not used during - * fork() to duplicate a mapping, but instead to prepare for KSM or temporarily - * unmapping a page (swap, migration) via folio_remove_rmap_*(). - * - * Marking the page shared can only fail if the page may be pinned; device - * private pages cannot get pinned and consequently this function cannot fail. - * - * Returns 0 if marking the page possibly shared succeeded. Returns -EBUSY - * otherwise. 
- */ -static inline int page_try_share_anon_rmap(struct page *page) +static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, + struct page *page, int nr_pages, enum rmap_level level) { - VM_WARN_ON(folio_test_hugetlb(page_folio(page))); - VM_BUG_ON_PAGE(!PageAnon(page) || !PageAnonExclusive(page), page); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); + __folio_rmap_sanity_checks(folio, page, nr_pages, level); - /* device private pages cannot get pinned via GUP. */ - if (unlikely(is_device_private_page(page))) { + /* device private folios cannot get pinned via GUP. */ + if (unlikely(folio_is_device_private(folio))) { ClearPageAnonExclusive(page); return 0; } @@ -553,7 +537,7 @@ static inline int page_try_share_anon_rmap(struct page *page) if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) smp_mb(); - if (unlikely(page_maybe_dma_pinned(page))) + if (unlikely(folio_maybe_dma_pinned(folio))) return -EBUSY; ClearPageAnonExclusive(page); @@ -566,6 +550,68 @@ static inline int page_try_share_anon_rmap(struct page *page) return 0; } +/** + * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page + * mapped by a PTE possibly shared to prepare + * for KSM or temporary unmapping + * @folio: The folio to share a mapping of + * @page: The mapped exclusive page + * + * The caller needs to hold the page table lock and has to have the page table + * entries cleared/invalidated. + * + * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during + * fork() to duplicate mappings, but instead to prepare for KSM or temporarily + * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte(). + * + * Marking the mapped page shared can only fail if the folio maybe pinned; + * device private folios cannot get pinned and consequently this function cannot + * fail. + * + * Returns 0 if marking the mapped page possibly shared succeeded. Returns + * -EBUSY otherwise. 
+ */ +static inline int folio_try_share_anon_rmap_pte(struct folio *folio, + struct page *page) +{ + return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); +} + +/** + * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page + * range mapped by a PMD possibly shared to + * prepare for temporary unmapping + * @folio: The folio to share the mapping of + * @page: The first page to share the mapping of + * + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) + * + * The caller needs to hold the page table lock and has to have the page table + * entries cleared/invalidated. + * + * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during + * fork() to duplicate a mapping, but instead to prepare for temporarily + * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd(). + * + * Marking the mapped pages shared can only fail if the folio maybe pinned; + * device private folios cannot get pinned and consequently this function cannot + * fail. + * + * Returns 0 if marking the mapped pages possibly shared succeeded. Returns + * -EBUSY otherwise. + */ +static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, + struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, + RMAP_LEVEL_PMD); +#else + WARN_ON_ONCE(true); + return -EBUSY; +#endif +} + /* * Called from mm/vmscan.c to handle paging out */ -- cgit v1.2.3 From 90ca22513ed5d7cf546c7c8d35a03ec2a2f5c87e Mon Sep 17 00:00:00 2001 From: Mathis Marion Date: Tue, 19 Dec 2023 14:11:54 +0100 Subject: lib: crc_ccitt_false() is identical to crc_itu_t() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit crc_ccitt_false() was introduced in commit 0d85adb5fbd33 ("lib/crc-ccitt: Add CCITT-FALSE CRC16 variant"), but it is redundant with crc_itu_t(). Since the latter is more used, it is the one being kept. 
Link: https://lkml.kernel.org/r/20231219131154.748577-1-Mathis.Marion@silabs.com Signed-off-by: Mathis Marion Cc: Andrey Smirnov Cc: Andrey Vostrikov Cc: Jérôme Pouiller Signed-off-by: Andrew Morton --- include/linux/crc-ccitt.h | 7 ------- include/linux/surface_aggregator/serial_hub.h | 4 ++-- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crc-ccitt.h b/include/linux/crc-ccitt.h index 72c92c396bb8d..cd4f420231bab 100644 --- a/include/linux/crc-ccitt.h +++ b/include/linux/crc-ccitt.h @@ -5,19 +5,12 @@ #include extern u16 const crc_ccitt_table[256]; -extern u16 const crc_ccitt_false_table[256]; extern u16 crc_ccitt(u16 crc, const u8 *buffer, size_t len); -extern u16 crc_ccitt_false(u16 crc, const u8 *buffer, size_t len); static inline u16 crc_ccitt_byte(u16 crc, const u8 c) { return (crc >> 8) ^ crc_ccitt_table[(crc ^ c) & 0xff]; } -static inline u16 crc_ccitt_false_byte(u16 crc, const u8 c) -{ - return (crc << 8) ^ crc_ccitt_false_table[(crc >> 8) ^ c]; -} - #endif /* _LINUX_CRC_CCITT_H */ diff --git a/include/linux/surface_aggregator/serial_hub.h b/include/linux/surface_aggregator/serial_hub.h index 5c4ae1a261831..d8dbef6b7fc20 100644 --- a/include/linux/surface_aggregator/serial_hub.h +++ b/include/linux/surface_aggregator/serial_hub.h @@ -12,7 +12,7 @@ #ifndef _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H #define _LINUX_SURFACE_AGGREGATOR_SERIAL_HUB_H -#include +#include #include #include #include @@ -188,7 +188,7 @@ static_assert(sizeof(struct ssh_command) == 8); */ static inline u16 ssh_crc(const u8 *buf, size_t len) { - return crc_ccitt_false(0xffff, buf, len); + return crc_itu_t(0xffff, buf, len); } /* -- cgit v1.2.3 From 29166371ef6780429e4cb84f1827fafbdd4005ab Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 27 Dec 2023 07:46:25 +0800 Subject: kdump: remove redundant DEFAULT_CRASH_KERNEL_LOW_SIZE Remove duplicate definitions, no functional changes. 
Link: https://lkml.kernel.org/r/MW4PR84MB3145459ADC7EB38BBB36955B8198A@MW4PR84MB3145.NAMPRD84.PROD.OUTLOOK.COM Signed-off-by: Youling Tang Reported-by: Huacai Chen Acked-by: Baoquan He Signed-off-by: Andrew Morton --- include/linux/crash_core.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 5126a4fecb442..9eaeaafe0cad3 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -87,12 +87,6 @@ Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); -#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION -#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE -#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) -#endif -#endif - int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, bool *high); -- cgit v1.2.3 From 501a06fe8e4c185bbda371b8cedbdf1b23a633d8 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Thu, 7 Dec 2023 11:24:06 -0800 Subject: zswap: memcontrol: implement zswap writeback disabling During our experiment with zswap, we sometimes observe swap IOs due to occasional zswap store failures and writebacks-to-swap. These swapping IOs prevent many users who cannot tolerate swapping from adopting zswap to save memory and improve performance where possible. This patch adds the option to disable this behavior entirely: do not write back to the backing swap device when a zswap store attempt fails, and do not write pages in the zswap pool back to the backing swap device (both when the pool is full, and when the new zswap shrinker is called). This new behavior can be opted-in/out on a per-cgroup basis via a new cgroup file. By default, writeback to the swap device is enabled, which is the previous behavior.
Initially, writeback is enabled for the root cgroup, and a newly created cgroup will inherit the current setting of its parent. Note that this is subtly different from setting memory.swap.max to 0, as it still allows for pages to be stored in the zswap pool (which itself consumes swap space in its current form). This patch should be applied on top of the zswap shrinker series: https://lore.kernel.org/linux-mm/20231130194023.4102148-1-nphamcs@gmail.com/ as it also disables the zswap shrinker, a major source of zswap writebacks. For the most part, this feature is motivated by internal parties who have already established their opinions regarding swapping - the workloads that are highly sensitive to IO, and especially those who are using servers with really slow disk performance (for instance, massive but slow HDDs). For these folks, it's impossible to convince them to even entertain zswap if swapping also comes as a package deal. Writeback disabling is quite a useful feature in these situations - in a mixed-workload deployment, they can disable writeback for the more IO-sensitive workloads, and enable writeback for other background workloads. For instance, on a server with an HDD, I allocate memory and populate it with random values (so that zswap store will always fail), and specify memory.high low enough to trigger reclaim. The time it takes to allocate the memory and just read through it a couple of times (doing silly things like computing the values' average etc.): zswap.writeback disabled: real 0m30.537s user 0m23.687s sys 0m6.637s 0 pages swapped in 0 pages swapped out zswap.writeback enabled: real 0m45.061s user 0m24.310s sys 0m8.892s 712686 pages swapped in 461093 pages swapped out (the last two lines of each result are from vmstat -s).
[nphamcs@gmail.com: add a comment about recurring zswap store failures leading to reclaim inefficiency] Link: https://lkml.kernel.org/r/20231221005725.3446672-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231207192406.3809579-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Reviewed-by: Yosry Ahmed Acked-by: Chris Li Cc: Dan Streetman Cc: David Heidelberg Cc: Domenico Cerasuolo Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Michal Hocko Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Seth Jennings Cc: Shakeel Butt Cc: Tejun Heo Cc: Vitaly Wool Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 ++++++++++++ include/linux/zswap.h | 7 +++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 43b77363ab8e7..5de775e6cdd91 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -219,6 +219,12 @@ struct mem_cgroup { #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) unsigned long zswap_max; + + /* + * Prevent pages from this memcg from being written back from zswap to + * swap, and from being swapped out on zswap store failures. 
+ */ + bool zswap_writeback; #endif unsigned long soft_limit; @@ -1941,6 +1947,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, bool obj_cgroup_may_zswap(struct obj_cgroup *objcg); void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size); void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size); +bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg); #else static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) { @@ -1954,6 +1961,11 @@ static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) { } +static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg) +{ + /* if zswap is disabled, do not block pages going to the swapping device */ + return true; +} #endif #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index e88572d4c7202..0b709f5bc65fa 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -35,6 +35,7 @@ void zswap_swapoff(int type); void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg); void zswap_lruvec_state_init(struct lruvec *lruvec); void zswap_folio_swapin(struct folio *folio); +bool is_zswap_enabled(void); #else struct zswap_lruvec_state {}; @@ -55,6 +56,12 @@ static inline void zswap_swapoff(int type) {} static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {} static inline void zswap_lruvec_state_init(struct lruvec *lruvec) {} static inline void zswap_folio_swapin(struct folio *folio) {} + +static inline bool is_zswap_enabled(void) +{ + return false; +} + #endif #endif /* _LINUX_ZSWAP_H */ -- cgit v1.2.3 From 7c223098212957a1ecd8768e8e747ae2cf88e880 Mon Sep 17 00:00:00 2001 From: David Laight Date: Fri, 29 Dec 2023 20:53:49 +0000 Subject: locking/osq_lock: Move the definition of optimistic_spin_node into osq_lock.c struct optimistic_spin_node is private to the implementation. Move it into the C file to ensure nothing is accessing it. 
Signed-off-by: David Laight Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/osq_lock.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 5581dbd3bd340..ea8fb31379e3c 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -6,11 +6,6 @@ * An MCS like lock especially tailored for optimistic spinning for sleeping * lock implementations (mutex, rwsem, etc). */ -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # + 1 value */ -}; struct optimistic_spin_queue { /* -- cgit v1.2.3 From cff9c565e65f3622e8dc1dcc21c1520a083dff35 Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Wed, 20 Dec 2023 01:52:29 -0300 Subject: net: mdio: get/put device node during (un)registration The __of_mdiobus_register() function was storing the device node in dev.of_node without increasing its reference count. It implicitly relied on the caller to maintain the allocated node until the mdiobus was unregistered. Now, __of_mdiobus_register() will acquire the node before assigning it, and of_mdiobus_unregister_callback() will be called at the end of mdio_unregister(). Drivers can now release the node immediately after MDIO registration. Some of them are already doing that even before this patch. Signed-off-by: Luiz Angelo Daros de Luca Signed-off-by: David S. 
Miller --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index e9e85d3475872..ede891776d8b0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -434,6 +434,9 @@ struct mii_bus { /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; + + /** @__unregister_callback: called at the last step of unregistration */ + void (*__unregister_callback)(struct mii_bus *bus); }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -- cgit v1.2.3 From 02018c544ef113e980a2349eba89003d6f399d22 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:34 +0100 Subject: net: phy: Introduce ethernet link topology representation Link topologies containing multiple network PHYs attached to the same net_device can be found when using a PHY as a media converter for use with an SFP connector, on which an SFP transceiver containing a PHY can be used. With the current model, the transceiver's PHY can't be used for operations such as cable testing, timestamping, macsec offload, etc. The reason is that most of the logic for these configurations, coming from either ethtool netlink or ioctls, tends to use netdev->phydev, which in multi-phy systems will reference the PHY closest to the MAC. Introduce a numbering scheme that allows enumerating the PHY devices that belong to any netdev, which can in turn allow userspace to take more precise decisions with regard to each PHY's configuration. The numbering is maintained per-netdev, in a struct phy_link_topology. The numbering works similarly to a netdevice's ifindex, with identifiers that are only recycled once INT_MAX has been reached. This prevents races that could occur between PHY listing and SFP transceiver removal/insertion. The identifiers are assigned at phy_attach time, as the numbering depends on the netdevice the phy is attached to.
Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +- include/linux/phy.h | 4 ++ include/linux/phy_link_topology.h | 67 ++++++++++++++++++++++++++++++++++ include/linux/phy_link_topology_core.h | 19 ++++++++++ 4 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 include/linux/phy_link_topology.h create mode 100644 include/linux/phy_link_topology_core.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 75c7725e5e4fd..5baa5517f5330 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,7 +40,6 @@ #include #endif #include - #include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2047,6 +2047,7 @@ enum netdev_stat_type { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one + * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2441,6 +2442,7 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif + struct phy_link_topology link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index ede891776d8b0..ea9416797b89c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -547,6 +547,9 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. + * @phyindex: Unique id across the phy's parent tree of phys to address the PHY + * from userspace, similar to ifindex. A zero index means the PHY + * wasn't assigned an id yet. 
* @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -646,6 +649,7 @@ struct phy_device { struct device_link *devlink; + u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h new file mode 100644 index 0000000000000..91902263ec0ef --- /dev/null +++ b/include/linux/phy_link_topology.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PHY device list allow maintaining a list of PHY devices that are + * part of a netdevice's link topology. PHYs can for example be chained, + * as is the case when using a PHY that exposes an SFP module, on which an + * SFP transceiver that embeds a PHY is connected. + * + * This list can then be used by userspace to leverage individual PHY + * capabilities. + */ +#ifndef __PHY_LINK_TOPOLOGY_H +#define __PHY_LINK_TOPOLOGY_H + +#include +#include + +struct xarray; +struct phy_device; +struct net_device; +struct sfp_bus; + +struct phy_device_node { + enum phy_upstream upstream_type; + + union { + struct net_device *netdev; + struct phy_device *phydev; + } upstream; + + struct sfp_bus *parent_sfp_bus; + + struct phy_device *phy; +}; + +static inline struct phy_device * +phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex) +{ + struct phy_device_node *pdn = xa_load(&topo->phys, phyindex); + + if (pdn) + return pdn->phy; + + return NULL; +} + +#if IS_ENABLED(CONFIG_PHYLIB) +int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream); + +void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy); + +#else +static inline int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream) +{ + return 0; +} + +static inline void phy_link_topo_del_phy(struct 
phy_link_topology *topo, + struct phy_device *phy) +{ +} +#endif + +#endif /* __PHY_LINK_TOPOLOGY_H */ diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h new file mode 100644 index 0000000000000..78c75f9094897 --- /dev/null +++ b/include/linux/phy_link_topology_core.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PHY_LINK_TOPOLOGY_CORE_H +#define __PHY_LINK_TOPOLOGY_CORE_H + +struct xarray; + +struct phy_link_topology { + struct xarray phys; + + u32 next_phy_index; +}; + +static inline void phy_link_topo_init(struct phy_link_topology *topo) +{ + xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1); + topo->next_phy_index = 1; +} + +#endif /* __PHY_LINK_TOPOLOGY_CORE_H */ -- cgit v1.2.3 From 9c5625f559ad6fe9f6f733c11475bf470e637d34 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:35 +0100 Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY Pass the phy_device as a parameter to the sfp upstream .disconnect_phy operation. This is preparatory work to help track phy devices across a net_device's link. Signed-off-by: Maxime Chevallier Signed-off-by: David S. 
Miller --- include/linux/sfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 9346cd44814d6..0573e53b0c11f 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -544,7 +544,7 @@ struct sfp_upstream_ops { void (*link_down)(void *priv); void (*link_up)(void *priv); int (*connect_phy)(void *priv, struct phy_device *); - void (*disconnect_phy)(void *priv); + void (*disconnect_phy)(void *priv, struct phy_device *); }; #if IS_ENABLED(CONFIG_SFP) -- cgit v1.2.3 From 034fcc210349b873ece7356905be5c6ca11eef2a Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:36 +0100 Subject: net: phy: add helpers to handle sfp phy connect/disconnect There are a few PHY drivers that can handle SFP modules through their sfp_upstream_ops. Introduce Phylib helpers to keep track of connected SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the upstream PHY's netdev's namespace. By doing so, these SFP PHYs can be enumerated and exposed to users, which will be able to use their capabilities. Signed-off-by: Maxime Chevallier Signed-off-by: David S. 
Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index ea9416797b89c..ac22b8e28a853 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1729,6 +1729,8 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); +int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); +void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, -- cgit v1.2.3 From dedd702a35793ab462fce4c737eeba0badf9718e Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:37 +0100 Subject: net: sfp: Add helper to return the SFP bus name Knowing the bus name is helpful when we want to expose the link topology to userspace, add a helper to return the SFP bus name. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/linux/sfp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 0573e53b0c11f..55c0ab17c9e2e 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -570,6 +570,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode); int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops); void sfp_bus_del_upstream(struct sfp_bus *bus); +const char *sfp_get_name(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -648,6 +649,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } + +static inline const char *sfp_get_name(struct sfp_bus *bus) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 5a0e241003b80247de59727c945bc94c848f893d Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 29 Nov 2023 09:43:28 -0300 Subject: thermal/core: Prepare for introduction of thermal reboot Add some helper functions to make it easier introducing the support for thermal reboot. No functional change. 
Signed-off-by: Fabio Estevam Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231129124330.519423-2-festevam@gmail.com --- include/linux/reboot.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index c4cc3b89ced1f..4586c663884ee 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -177,7 +177,12 @@ void ctrl_alt_del(void); extern void orderly_poweroff(bool force); extern void orderly_reboot(void); -void hw_protection_shutdown(const char *reason, int ms_until_forced); +void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); + +static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) +{ + __hw_protection_shutdown(reason, ms_until_forced, true); +} /* * Emergency restart, callable from an interrupt handler. -- cgit v1.2.3 From 79fa723ba84c2b1b3124c72df8a3b07b851a5477 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 29 Nov 2023 09:43:29 -0300 Subject: reboot: Introduce thermal_zone_device_critical_reboot() Introduce thermal_zone_device_critical_reboot() to trigger an emergency reboot. It is a counterpart of thermal_zone_device_critical() with the difference that it will force a reboot instead of shutdown. The motivation for doing this is to allow the thermal subsystem to trigger a reboot when the temperature reaches the critical temperature. 
Signed-off-by: Fabio Estevam Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20231129124330.519423-3-festevam@gmail.com --- include/linux/reboot.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 4586c663884ee..abcdde4df6979 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -179,6 +179,11 @@ extern void orderly_poweroff(bool force); extern void orderly_reboot(void); void __hw_protection_shutdown(const char *reason, int ms_until_forced, bool shutdown); +static inline void hw_protection_reboot(const char *reason, int ms_until_forced) +{ + __hw_protection_shutdown(reason, ms_until_forced, false); +} + static inline void hw_protection_shutdown(const char *reason, int ms_until_forced) { __hw_protection_shutdown(reason, ms_until_forced, true); -- cgit v1.2.3 From 37d158d0b05144f696323ae5bbfe1e137f7c06d3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 20 Dec 2023 08:38:46 +0100 Subject: HID: make hid_bus_type const Now that the driver core can properly handle constant struct bus_type, move the hid_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: linux-input@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index bf43f3ff66640..7c26db874ff03 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -912,7 +912,7 @@ extern bool hid_ignore(struct hid_device *); extern int hid_add_device(struct hid_device *); extern void hid_destroy_device(struct hid_device *); -extern struct bus_type hid_bus_type; +extern const struct bus_type hid_bus_type; extern int __must_check __hid_register_driver(struct hid_driver *, struct module *, const char *mod_name); -- cgit v1.2.3 From 9b0a3839e8d29663cd9ee2c43d38b06c3b91619e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 20 Dec 2023 08:38:48 +0100 Subject: HID: bpf: make bus_type const in struct hid_bpf_ops The struct bus_type pointer in hid_bpf_ops just passes the pointer to the driver core, and the driver core can handle, and expects, a constant pointer, so also make the pointer constant in hid_bpf_ops. Part of the process of moving all usages of struct bus_type to be constant to move them all to read-only memory. 
Cc: Jiri Kosina Cc: Benjamin Tissoires Cc: linux-input@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/hid_bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index e9afb61e6ee01..840cd254172d0 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -115,7 +115,7 @@ struct hid_bpf_ops { size_t len, enum hid_report_type rtype, enum hid_class_request reqtype); struct module *owner; - struct bus_type *bus_type; + const struct bus_type *bus_type; }; extern struct hid_bpf_ops *hid_bpf_ops; -- cgit v1.2.3 From 87d8f1ee1d4070571095859d310f6951440f288c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 23 Dec 2023 11:43:19 -0800 Subject: platform/x86: wmi: linux/wmi.h: fix Excess kernel-doc description warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the "private:" comment to prevent the kernel-doc warning: include/linux/wmi.h:27: warning: Excess struct member 'setable' description in 'wmi_device' Either a struct member is documented (via kernel-doc) or it's private, but not both. 
Fixes: b4cc979588ee ("platform/x86: wmi: Add kernel doc comments") Signed-off-by: Randy Dunlap Cc: Armin Wolf Cc: Hans de Goede Cc: Ilpo Järvinen Cc: platform-driver-x86@vger.kernel.org Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20231223194321.23084-1-rdunlap@infradead.org Signed-off-by: Hans de Goede --- include/linux/wmi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 50f7f1e4fd4f8..686291b878525 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -21,8 +21,6 @@ */ struct wmi_device { struct device dev; - - /* private: used by the WMI driver core */ bool setable; }; -- cgit v1.2.3 From 993498e537af9260e697219ce41b41b22b6199cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Dec 2023 14:07:47 +0000 Subject: net-device: move gso_partial_features to net_device_read_tx dev->gso_partial_features is read from tx fast path for GSO packets. Move it to appropriate section to avoid a cache line miss. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: David Ahern Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5baa5517f5330..d59db9adcc96e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2115,6 +2115,7 @@ struct net_device { const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; struct netdev_queue *_tx; + netdev_features_t gso_partial_features; unsigned int real_num_tx_queues; unsigned int gso_max_size; unsigned int gso_ipv4_max_size; @@ -2211,7 +2212,6 @@ struct net_device { netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; - netdev_features_t gso_partial_features; unsigned int min_mtu; unsigned int max_mtu; -- cgit v1.2.3 From 059d37b718d38d26087121c754691df77acfc66b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:06:13 -0800 Subject: net: phy: linux/phy.h: fix Excess kernel-doc description warning Remove the @phy_timer: line to prevent the kernel-doc warning: include/linux/phy.h:768: warning: Excess struct member 'phy_timer' description in 'phy_device' Signed-off-by: Randy Dunlap Cc: Andrew Lunn Cc: Heiner Kallweit Cc: Russell King Cc: netdev@vger.kernel.org Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3cc52826f18e9..bd285950972c4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -568,7 +568,6 @@ struct macsec_ops; * - Bits [31:24] are reserved for defining generic * PHY driver behavior. 
* @irq: IRQ number of the PHY's interrupt (-1 if none) - * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached * @sfp_bus: SFP bus attached to this PHY's fiber port -- cgit v1.2.3 From e4df56ad0bf3506c5189abb9be83f3bea05a4c4f Mon Sep 17 00:00:00 2001 From: Lin Gui Date: Tue, 19 Dec 2023 07:05:32 +0800 Subject: mmc: core: Add wp_grp_size sysfs node The eMMC card can be set into write-protected mode to prevent data from being accidentally modified or deleted. Wp_grp_size (Write Protect Group Size) refers to an attribute of the eMMC card, used to manage write protection and is the CSD register [36:32] of the eMMC device. Wp_grp_size (Write Protect Group Size) indicates how many eMMC blocks are contained in each write protection group on the eMMC card. To allow userspace easy access of the CSD register bits, let's add sysfs node "wp_grp_size". Signed-off-by: Lin Gui Signed-off-by: Bo Ye Reviewed-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20231218230532.82427-1-bo.ye@mediatek.com Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 47eeb122524c4..f34407cc27888 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -32,6 +32,7 @@ struct mmc_csd { unsigned int r2w_factor; unsigned int max_dtr; unsigned int erase_size; /* In sectors */ + unsigned int wp_grp_size; unsigned int read_blkbits; unsigned int write_blkbits; unsigned int capacity; @@ -303,6 +304,7 @@ struct mmc_card { unsigned int eg_boundary; /* don't cross erase-group boundaries */ unsigned int erase_arg; /* erase / trim / discard */ u8 erased_byte; /* value of erased bytes */ + unsigned int wp_grp_size; /* write group size in sectors */ u32 raw_cid[4]; /* raw card CID */ u32 raw_csd[4]; /* raw card CSD */ -- 
cgit v1.2.3 From 3e64db35bc37edbe9e37aaa987df92cde12ddb6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Jan 2024 14:23:34 -0800 Subject: Revert "net: mdio: get/put device node during (un)registration" This reverts commit cff9c565e65f3622e8dc1dcc21c1520a083dff35. Revert based on feedback from Russell. Link: https://lore.kernel.org/all/ZZPtUIRerqTI2%2Fyh@shell.armlinux.org.uk/ Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index ac22b8e28a853..6cb9d843aee93 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -434,9 +434,6 @@ struct mii_bus { /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; - - /** @__unregister_callback: called at the last step of unregistration */ - void (*__unregister_callback)(struct mii_bus *bus); }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -- cgit v1.2.3 From db02e176f597a14eb696141ffa008c2429453a15 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 6 Dec 2023 16:42:31 -0600 Subject: PCI/AER: Use explicit register sizes for struct members aer_irq() reads the AER Root Error Status and Error Source Identification (PCI_ERR_ROOT_STATUS and PCI_ERR_ROOT_ERR_SRC) registers directly into struct aer_err_source. Both registers are 32 bits, so declare the members explicitly as "u32" instead of "unsigned int". Similarly, aer_get_device_error_info() reads the AER Header Log (PCI_ERR_HEADER_LOG) registers, which are also 32 bits, into struct aer_header_log_regs. Declare those members as "u32" as well. No functional changes intended. 
Link: https://lore.kernel.org/r/20231206224231.732765-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron --- include/linux/aer.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index f6ea2f57d8089..ae0fae70d4bd2 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -19,10 +19,10 @@ struct pci_dev; struct aer_header_log_regs { - unsigned int dw0; - unsigned int dw1; - unsigned int dw2; - unsigned int dw3; + u32 dw0; + u32 dw1; + u32 dw2; + u32 dw3; }; struct aer_capability_regs { -- cgit v1.2.3 From 7d4b5d7a37bdd63a5a3371b988744b060d5bb86f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 27 Dec 2023 21:38:23 +0100 Subject: async: Introduce async_schedule_dev_nocall() In preparation for subsequent changes, introduce a specialized variant of async_schedule_dev() that will not invoke the argument function synchronously when it cannot be scheduled for asynchronous execution. The new function, async_schedule_dev_nocall(), will be used for fixing possible deadlocks in the system-wide power management core code. Signed-off-by: Rafael J. Wysocki Reviewed-by: Stanislaw Gruszka for the series. 
Tested-by: Youngmin Nam Reviewed-by: Ulf Hansson --- include/linux/async.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/async.h b/include/linux/async.h index cce4ad31e8fcf..33c9ff4afb492 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -90,6 +90,8 @@ async_schedule_dev(async_func_t func, struct device *dev) return async_schedule_node(func, dev, dev_to_node(dev)); } +bool async_schedule_dev_nocall(async_func_t func, struct device *dev); + /** * async_schedule_dev_domain - A device specific version of async_schedule_domain * @func: function to execute asynchronously -- cgit v1.2.3 From 3b82024c5ba93e7a0db2d0b9635ca6b28338efd7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 28 Dec 2023 13:04:41 +0530 Subject: OPP: Move dev_pm_opp_icc_bw to internal opp.h It isn't used by any driver or API, privatize it. Signed-off-by: Viresh Kumar --- include/linux/pm_opp.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 74768c47d7904..76dcb7f37bcdf 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -45,18 +45,6 @@ struct dev_pm_opp_supply { unsigned long u_watt; }; -/** - * struct dev_pm_opp_icc_bw - Interconnect bandwidth values - * @avg: Average bandwidth corresponding to this OPP (in icc units) - * @peak: Peak bandwidth corresponding to this OPP (in icc units) - * - * This structure stores the bandwidth values for a single interconnect path. 
- */ -struct dev_pm_opp_icc_bw { - u32 avg; - u32 peak; -}; - typedef int (*config_regulators_t)(struct device *dev, struct dev_pm_opp *old_opp, struct dev_pm_opp *new_opp, struct regulator **regulators, unsigned int count); -- cgit v1.2.3 From a15f2d48c6f84ae0dd2000288592c79d5d1acd0e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 16:47:41 +0100 Subject: nubus: Make nubus_bus_type static and constant Now that the driver core can properly handle constant struct bus_type, move the nubus_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. It's also never used outside of drivers/nubus/bus.c so make it static and don't export it as no one is using it. Signed-off-by: Greg Kroah-Hartman Acked-by: Finn Thain Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/2023121940-enlarged-editor-c9a8@gregkh Signed-off-by: Geert Uytterhoeven --- include/linux/nubus.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nubus.h b/include/linux/nubus.h index bdcd85e622d80..4d103ac8f5c7a 100644 --- a/include/linux/nubus.h +++ b/include/linux/nubus.h @@ -89,8 +89,6 @@ struct nubus_driver { void (*remove)(struct nubus_board *board); }; -extern struct bus_type nubus_bus_type; - /* Generic NuBus interface functions, modelled after the PCI interface */ #ifdef CONFIG_PROC_FS extern bool nubus_populate_procfs; -- cgit v1.2.3 From 2ad28ce9b98f8b22feaecc0966c706a8ef59cbf0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Jan 2024 12:35:27 +0200 Subject: nvme: remove unused definition There are no users of the NVMF_AUTH_HASH_LEN macro. 
Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 44325c068b6a0..462c21e0e4176 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -20,7 +20,6 @@ #define NVMF_TRSVCID_SIZE 32 #define NVMF_TRADDR_SIZE 256 #define NVMF_TSAS_SIZE 256 -#define NVMF_AUTH_HASH_LEN 64 #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -- cgit v1.2.3 From 7865dfb1eb941ddd25802a9e13b6ff5f3f4dc02f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:24 -0800 Subject: bpf: sockmap, added comments describing update proto rules Add a comment describing that the psock update proto callback can be called multiple times and this must be safe. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c953b8c0d2f43..888a4b217829f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -100,6 +100,11 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + /* psock_update_sk_prot may be called with restore=false many times + * so the handler must be safe for this case. It will be called + * exactly once with restore=true when the psock is being destroyed + * and psock refcnt is zero, but before an RCU grace period. 
+ */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; -- cgit v1.2.3 From d3d344a1ca69d8fb2413e29e6400f3ad58a05c06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jan 2024 16:22:20 +0000 Subject: net-device: move xdp_prog to net_device_read_rx xdp_prog is used in receive path, both from XDP enabled drivers and from netif_elide_gro(). This patch also removes two 4-bytes holes. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: Simon Horman Link: https://lore.kernel.org/r/20240102162220.750823-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d59db9adcc96e..e265aa1f21699 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2150,6 +2150,7 @@ struct net_device { /* RX read-mostly hotpath */ __cacheline_group_begin(net_device_read_rx); + struct bpf_prog __rcu *xdp_prog; struct list_head ptype_specific; int ifindex; unsigned int real_num_rx_queues; @@ -2325,7 +2326,6 @@ struct net_device { const unsigned char *dev_addr; unsigned int num_rx_queues; - struct bpf_prog __rcu *xdp_prog; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. -- cgit v1.2.3 From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:39 -0800 Subject: bpf: Add objcg to bpf_mem_alloc The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's are with the same objcg. This patch made such a property explicit. The next patch will use this property to save and restore objcg for percpu unit allocator. 
Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index bb1223b213087..acef8c8085996 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct obj_cgroup *objcg; bool percpu; struct work_struct work; }; -- cgit v1.2.3 From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:45 -0800 Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation") added support for non-fix-size percpu memory allocation. Such allocation will allocate percpu memory for all buckets on all cpus and the memory consumption grows quadratically with the number of cpus. For example, let us say, 4 cpus, unit size 16 bytes, so each cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes. Then let us say, 8 cpus with the same unit size, each cpu has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes. So if the number of cpus doubles, the number of memory consumption will be 4 times. So for a system with large number of cpus, the memory consumption goes up quickly with quadratic order. For example, for 4KB percpu allocation, 128 cpus. The total memory consumption will be 4KB * 128 * 128 = 64MB. Things will become worse if the number of cpus is bigger (e.g., 512, 1024, etc.) In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is done in boot time, so for system with large number of cpus, the initial percpu memory consumption is very visible. 
For example, for 128 cpu system, the total percpu memory allocation will be at least (16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096) * 128 * 128 = ~138MB, which is pretty big. It will be even bigger for larger number of cpus. Note that the current prefill also allocates 4 entries if the unit size is less than 256. So on top of 138MB memory consumption, this will add more consumption with 3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB. Next patch will try to reduce this memory consumption. Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory at init stage") moved the non-fix-size percpu memory allocation to bpf verification stage. Once a particular bpf_percpu_obj_new() is called by bpf program, the memory allocator will try to fill in the cache with all sizes, causing the same amount of percpu memory consumption as in the boot stage. To reduce the initial percpu memory consumption for non-fix-size percpu memory allocation, instead of filling the cache with all supported allocation sizes, this patch intends to fill the cache only for the requested size. As typically users will not use large percpu data structure, this can save memory significantly. For example, the allocation size is 64 bytes with 128 cpus. Then total percpu memory amount will be 64 * 128 * 128 = 1MB, much less than previous 138MB. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index acef8c8085996..aaf004d943228 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -22,8 +22,15 @@ struct bpf_mem_alloc { * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. 
* Alloc and free are done with bpf_mem_{alloc,free}() and the size of * the returned object is given by the size argument of bpf_mem_alloc(). + * If percpu equals true, error will be returned in order to avoid + * large memory consumption and the below bpf_mem_alloc_percpu_unit_init() + * should be used to do on-demand per-cpu allocation for each size. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +/* Initialize a non-fix-size percpu memory allocator */ +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg); +/* The percpu allocation with a specific unit size. */ +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ -- cgit v1.2.3 From 5e5401d6612ef599ad45785b941eebda7effc90f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 4 Jan 2024 09:47:36 +0000 Subject: net: phylink: move phylink_pcs_neg_mode() into phylink.c Move phylink_pcs_neg_mode() from the header file into the .c file since nothing should be using it. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phylink.h | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 875439ab45dec..d589f89c612c6 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -98,72 +98,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode) return mode == MLO_AN_INBAND; } -/** - * phylink_pcs_neg_mode() - helper to determine PCS inband mode - * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND. 
- * @interface: interface mode to be used - * @advertising: adertisement ethtool link mode mask - * - * Determines the negotiation mode to be used by the PCS, and returns - * one of: - * - * - %PHYLINK_PCS_NEG_NONE: interface mode does not support inband - * - %PHYLINK_PCS_NEG_OUTBAND: an out of band mode (e.g. reading the PHY) - * will be used. - * - %PHYLINK_PCS_NEG_INBAND_DISABLED: inband mode selected but autoneg - * disabled - * - %PHYLINK_PCS_NEG_INBAND_ENABLED: inband mode selected and autoneg enabled - * - * Note: this is for cases where the PCS itself is involved in negotiation - * (e.g. Clause 37, SGMII and similar) not Clause 73. - */ -static inline unsigned int phylink_pcs_neg_mode(unsigned int mode, - phy_interface_t interface, - const unsigned long *advertising) -{ - unsigned int neg_mode; - - switch (interface) { - case PHY_INTERFACE_MODE_SGMII: - case PHY_INTERFACE_MODE_QSGMII: - case PHY_INTERFACE_MODE_QUSGMII: - case PHY_INTERFACE_MODE_USXGMII: - /* These protocols are designed for use with a PHY which - * communicates its negotiation result back to the MAC via - * inband communication. Note: there exist PHYs that run - * with SGMII but do not send the inband data. - */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; - break; - - case PHY_INTERFACE_MODE_1000BASEX: - case PHY_INTERFACE_MODE_2500BASEX: - /* 1000base-X is designed for use media-side for Fibre - * connections, and thus the Autoneg bit needs to be - * taken into account. We also do this for 2500base-X - * as well, but drivers may not support this, so may - * need to override this. 
- */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, - advertising)) - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED; - break; - - default: - neg_mode = PHYLINK_PCS_NEG_NONE; - break; - } - - return neg_mode; -} - /** * struct phylink_link_state - link state structure * @advertising: ethtool bitmask containing advertised link modes -- cgit v1.2.3 From 040a82be54c09a72162a3db2f5cd2ba289c0f224 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Oct 2023 12:26:31 +0100 Subject: netfs: Rearrange netfs_io_subrequest to put request pointer first Rearrange the netfs_io_subrequest struct to put the netfs_io_request pointer (rreq) first. This then allows netfs_io_subrequest to be put in a union with a pointer to a wrapper around netfs_io_request. This will be useful in the future for cifs and maybe ceph. Signed-off-by: David Howells cc: Steve French cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 852956aa3c4bb..d3bac60fcd6f3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -204,8 +204,8 @@ struct netfs_cache_resources { * the pages it points to can be relied on to exist for the duration. 
*/ struct netfs_io_subrequest { - struct work_struct work; struct netfs_io_request *rreq; /* Supervising I/O request */ + struct work_struct work; struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ loff_t start; /* Where to start the I/O */ -- cgit v1.2.3 From f36be9ce8146faabdbbf74ee0499edb2039c53a5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 14:13:10 +0100 Subject: EDAC: constantify the struct bus_type usage In many places in the edac code, struct bus_type pointers are passed around and then eventually sent to the driver core, which can handle a constant pointer. So constantify all of the edac usage of these as well because the data in them is never modified by the edac code either. Cc: Borislav Petkov Cc: Tony Luck Cc: James Morse Cc: Mauro Carvalho Chehab Cc: Robert Richter Cc: Link: https://lore.kernel.org/r/2023121909-tribute-punctuate-4b22@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/edac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/edac.h b/include/linux/edac.h index fa4bda2a70f6c..ccaf2ae0801d6 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -30,7 +30,7 @@ struct device; extern int edac_op_state; -struct bus_type *edac_get_sysfs_subsys(void); +const struct bus_type *edac_get_sysfs_subsys(void); static inline void opstate_init(void) { @@ -492,7 +492,7 @@ struct edac_raw_error_desc { */ struct mem_ctl_info { struct device dev; - struct bus_type *bus; + const struct bus_type *bus; struct list_head link; /* for global list of mem_ctl_info structs */ -- cgit v1.2.3 From db2292b01b799e926abfdbd6fafa1f27f0d0e457 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 15:07:23 +0100 Subject: PM: clk: make pm_clk_add_notifier() take a const pointer The driver core wants to work with const struct bus_type, so there's no reason that pm_clk_add_notifier() 
should not also do the same thing, considering that it just passes the pointer off to the driver core which is expecting a const *. Cc: Rafael J. Wysocki Link: https://lore.kernel.org/r/2023121922-triumph-exploit-f545@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/pm_clock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index ada3a0ab10bf2..68669ce187204 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -91,10 +91,10 @@ static inline int devm_pm_clk_create(struct device *dev) #endif #ifdef CONFIG_HAVE_CLK -extern void pm_clk_add_notifier(struct bus_type *bus, +extern void pm_clk_add_notifier(const struct bus_type *bus, struct pm_clk_notifier_block *clknb); #else -static inline void pm_clk_add_notifier(struct bus_type *bus, +static inline void pm_clk_add_notifier(const struct bus_type *bus, struct pm_clk_notifier_block *clknb) { } -- cgit v1.2.3 From e76933a9bfa9b7f28a387f2e13cb3e689adc200d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 15:06:19 +0100 Subject: maple: make maple_bus_type static and const There is no need to export maple_bus_type as no one uses it outside of maple.c, so make it static, AND make it const as it can be read-only as no one modifies it. 
Cc: Yoshinori Sato Cc: Rich Felker Cc: John Paul Adrian Glaubitz Cc: Link: https://lore.kernel.org/r/2023121918-rejoicing-frostlike-d976@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/maple.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/maple.h b/include/linux/maple.h index 9b140272ee165..9aae44efcfd4c 100644 --- a/include/linux/maple.h +++ b/include/linux/maple.h @@ -5,7 +5,6 @@ #include struct device; -extern struct bus_type maple_bus_type; /* Maple Bus command and response codes */ enum maple_code { -- cgit v1.2.3 From 86438841e48f6361f0a6a04805b7d7813738761f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 14:41:42 +0100 Subject: dma-debug: make dma_debug_add_bus take a const pointer The driver core now can handle a const struct bus_type pointer, and the dma_debug_add_bus() call just passes on the pointer given to it to the driver core, so make this pointer const as well to allow everyone to use read-only struct bus_type pointers going forward. 
Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Reviewed-by: Robin Murphy Link: https://lore.kernel.org/r/2023121941-dejected-nugget-681e@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/dma-map-ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index f2fc203fb8a1a..e401f824a007f 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -443,10 +443,10 @@ static inline void arch_teardown_dma_ops(struct device *dev) #endif /* CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS */ #ifdef CONFIG_DMA_API_DEBUG -void dma_debug_add_bus(struct bus_type *bus); +void dma_debug_add_bus(const struct bus_type *bus); void debug_dma_dump_mappings(struct device *dev); #else -static inline void dma_debug_add_bus(struct bus_type *bus) +static inline void dma_debug_add_bus(const struct bus_type *bus) { } static inline void debug_dma_dump_mappings(struct device *dev) -- cgit v1.2.3 From b5b0774d53bb81bddbf8c609b3f183d4af6e91da Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Wed, 20 Dec 2023 12:30:41 +0530 Subject: HID: amd_sfh: Add a new interface for exporting HPD data AMDSFH has information about the User presence information via the Human Presence Detection (HPD) sensor which is part of the AMD sensor fusion hub. Add a new interface to export this information, where other drivers like PMF can use this information to enhance user experiences. 
Link: https://lore.kernel.org/all/ad064333-48a4-4cfa-9428-69e8a7c44667@redhat.com/ Co-developed-by: Shyam Sundar S K Signed-off-by: Shyam Sundar S K Signed-off-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- include/linux/amd-pmf-io.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/linux/amd-pmf-io.h (limited to 'include/linux') diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h new file mode 100644 index 0000000000000..5b6d29d369221 --- /dev/null +++ b/include/linux/amd-pmf-io.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AMD Platform Management Framework Interface + * + * Copyright (c) 2023, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Authors: Shyam Sundar S K + * Basavaraj Natikar + */ + +#ifndef AMD_PMF_IO_H +#define AMD_PMF_IO_H + +#include + +/** + * enum sfh_message_type - Query the SFH message type + * @MT_HPD: Message ID to know the Human presence info from MP2 FW + */ +enum sfh_message_type { + MT_HPD, +}; + +/** + * enum sfh_hpd_info - Query the Human presence information + * @SFH_NOT_DETECTED: Check the HPD connection information from MP2 FW + * @SFH_USER_PRESENT: Check if the user is present from HPD sensor + * @SFH_USER_AWAY: Check if the user is away from HPD sensor + */ +enum sfh_hpd_info { + SFH_NOT_DETECTED, + SFH_USER_PRESENT, + SFH_USER_AWAY, +}; + +/** + * struct amd_sfh_info - get HPD sensor info from MP2 FW + * @user_present: Populates the user presence information + */ +struct amd_sfh_info { + u8 user_present; +}; + +int amd_get_sfh_info(struct amd_sfh_info *sfh_info, enum sfh_message_type op); +#endif -- cgit v1.2.3 From 584f35a3647d42980af495fc0bc5c51eb174aa35 Mon Sep 17 00:00:00 2001 From: Basavaraj Natikar Date: Wed, 20 Dec 2023 12:30:42 +0530 Subject: HID: amd_sfh: Add a new interface for exporting ALS data AMDSFH has information about the Ambient light via the Ambient Light Sensor (ALS) which is part of the AMD 
sensor fusion hub. Add a new interface to export this information, where other drivers like PMF can use this information to enhance user experiences. Link: https://lore.kernel.org/all/ad064333-48a4-4cfa-9428-69e8a7c44667@redhat.com/ Reviewed-by: Mario Limonciello Co-developed-by: Shyam Sundar S K Signed-off-by: Shyam Sundar S K Signed-off-by: Basavaraj Natikar Signed-off-by: Jiri Kosina --- include/linux/amd-pmf-io.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/amd-pmf-io.h b/include/linux/amd-pmf-io.h index 5b6d29d369221..b4f8182052169 100644 --- a/include/linux/amd-pmf-io.h +++ b/include/linux/amd-pmf-io.h @@ -17,9 +17,11 @@ /** * enum sfh_message_type - Query the SFH message type * @MT_HPD: Message ID to know the Human presence info from MP2 FW + * @MT_ALS: Message ID to know the Ambient light info from MP2 FW */ enum sfh_message_type { MT_HPD, + MT_ALS, }; /** @@ -36,9 +38,11 @@ enum sfh_hpd_info { /** * struct amd_sfh_info - get HPD sensor info from MP2 FW + * @ambient_light: Populates the ambient light information * @user_present: Populates the user presence information */ struct amd_sfh_info { + u32 ambient_light; u8 user_present; }; -- cgit v1.2.3 From 398aa9a7e77cf23c2a6f882ddd3dcd96f21771dc Mon Sep 17 00:00:00 2001 From: Manan Aurora Date: Tue, 31 Oct 2023 03:46:41 +0000 Subject: usb: dwc3: Support EBC feature of DWC_usb31 Support configuration and use of bulk endpoints in the so-called EBC mode described in the DWC_usb31 databook (appendix E). Added a bit fifo_mode to usb_ep to indicate to the UDC driver that a specific endpoint is to operate in the EBC (or equivalent) mode when enabled. Added macros for bits 15 and 14 of DEPCFG parameter 1 to indicate EBC mode and write back behaviour. 
These bits will be set to 1 when configuring an EBC endpoint as described in the programming guide Signed-off-by: Manan Aurora Link: https://lore.kernel.org/r/20231031034641.660606-1-maurora@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 6532beb587b19..a771ccc038ac9 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -236,6 +236,7 @@ struct usb_ep { unsigned max_streams:16; unsigned mult:2; unsigned maxburst:5; + unsigned fifo_mode:1; u8 address; const struct usb_endpoint_descriptor *desc; const struct usb_ss_ep_comp_descriptor *comp_desc; -- cgit v1.2.3 From 49a78b05d5ca1e23fd737747a8757b8bdc319b30 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 4 Jan 2024 11:28:22 +0800 Subject: USB: core: Use device_driver directly in struct usb_driver and usb_device_driver There is usbdrv_wrap in struct usb_driver and usb_device_driver, it contains device_driver and for_devices. for_devices is used to distinguish between device drivers and interface drivers. Like the is_usb_device(), it tests the type of the device. We can test that if the probe of device_driver is equal to usb_probe_device in is_usb_device_driver(), and then the struct usbdrv_wrap is no longer needed. Clean up struct usbdrv_wrap, use device_driver directly in struct usb_driver and usb_device_driver. This makes the code cleaner. 
Signed-off-by: Yajun Deng Acked-by: Alan Stern Link: https://lore.kernel.org/r/20240104032822.1896596-1-yajun.deng@linux.dev Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 07556341ba2b4..9e52179872a50 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1143,16 +1143,6 @@ extern ssize_t usb_store_new_id(struct usb_dynids *dynids, extern ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf); -/** - * struct usbdrv_wrap - wrapper for driver-model structure - * @driver: The driver-model core driver structure. - * @for_devices: Non-zero for device drivers, 0 for interface drivers. - */ -struct usbdrv_wrap { - struct device_driver driver; - int for_devices; -}; - /** * struct usb_driver - identifies USB interface driver to usbcore * @name: The driver name should be unique among USB drivers, @@ -1193,7 +1183,7 @@ struct usbdrv_wrap { * is bound to the driver. * @dynids: used internally to hold the list of dynamically added device * ids for this driver. - * @drvwrap: Driver-model core structure wrapper. + * @driver: The driver-model core driver structure. * @no_dynamic_id: if set to 1, the USB core will not allow dynamic ids to be * added to this driver by preventing the sysfs file from being created. 
* @supports_autosuspend: if set to 0, the USB core will not allow autosuspend @@ -1241,13 +1231,13 @@ struct usb_driver { const struct attribute_group **dev_groups; struct usb_dynids dynids; - struct usbdrv_wrap drvwrap; + struct device_driver driver; unsigned int no_dynamic_id:1; unsigned int supports_autosuspend:1; unsigned int disable_hub_initiated_lpm:1; unsigned int soft_unbind:1; }; -#define to_usb_driver(d) container_of(d, struct usb_driver, drvwrap.driver) +#define to_usb_driver(d) container_of(d, struct usb_driver, driver) /** * struct usb_device_driver - identifies USB device driver to usbcore @@ -1268,7 +1258,7 @@ struct usb_driver { * on to call the normal usb_choose_configuration(). * @dev_groups: Attributes attached to the device that will be created once it * is bound to the driver. - * @drvwrap: Driver-model core structure wrapper. + * @driver: The driver-model core driver structure. * @id_table: used with @match() to select better matching driver at * probe() time. * @supports_autosuspend: if set to 0, the USB core will not allow autosuspend @@ -1277,7 +1267,7 @@ struct usb_driver { * resume and suspend functions will be called in addition to the driver's * own, so this part of the setup does not need to be replicated. * - * USB drivers must provide all the fields listed above except drvwrap, + * USB drivers must provide all the fields listed above except driver, * match, and id_table. 
*/ struct usb_device_driver { @@ -1293,13 +1283,13 @@ struct usb_device_driver { int (*choose_configuration) (struct usb_device *udev); const struct attribute_group **dev_groups; - struct usbdrv_wrap drvwrap; + struct device_driver driver; const struct usb_device_id *id_table; unsigned int supports_autosuspend:1; unsigned int generic_subclass:1; }; #define to_usb_device_driver(d) container_of(d, struct usb_device_driver, \ - drvwrap.driver) + driver) /** * struct usb_class_driver - identifies a USB driver that wants to use the USB major number -- cgit v1.2.3 From cb86a3383aa7b9bb891daca691e596f6bfe52d82 Mon Sep 17 00:00:00 2001 From: Vamshi Gajjela Date: Thu, 9 Nov 2023 12:04:16 +0530 Subject: serial: core: Update uart_poll_timeout() function to return unsigned long The function uart_fifo_timeout() returns an unsigned long value, which is the number of jiffies. Therefore, change the variable timeout in the function uart_poll_timeout() from int to unsigned long. Change the return type of the function uart_poll_timeout() from int to unsigned long to be consistent with the type of timeout values. Signed-off-by: Vamshi Gajjela Link: https://lore.kernel.org/r/20231109063417.3971005-2-vamshigajjela@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 89f7b6c63598c..536b2581d3e20 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -852,9 +852,9 @@ static inline unsigned long uart_fifo_timeout(struct uart_port *port) } /* Base timer interval for polling */ -static inline int uart_poll_timeout(struct uart_port *port) +static inline unsigned long uart_poll_timeout(struct uart_port *port) { - int timeout = uart_fifo_timeout(port); + unsigned long timeout = uart_fifo_timeout(port); return timeout > 6 ? 
(timeout / 2 - 2) : 1; } -- cgit v1.2.3 From 5f1e77b2285b47c216b44e513071549cf006a309 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 30 Nov 2023 13:30:15 -0500 Subject: SUNRPC: Remove unused function rpc_clnt_xprt_switch_put() Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index e9d4377d03c6e..5e9d1469c6fae 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -252,7 +252,6 @@ void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *, const char *rpc_proc_name(const struct rpc_task *task); -void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, -- cgit v1.2.3 From 12fc0a963128b54b82e98b9909f463e784b90b07 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 15 Dec 2023 20:47:07 +0000 Subject: nfs: Remove writepage NFS already has writepages and migrate_folio, so it does not need to implement writepage. The writepage operation is deprecated as it leads to worse performance under high memory pressure due to folios being written out in LRU order rather than sequentially within a file. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Anna Schumaker --- include/linux/nfs_fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 279262057a925..f5ce7b1011461 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -595,7 +595,6 @@ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); * linux/fs/nfs/write.c */ extern int nfs_congestion_kb; -extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct folio *folio); extern int nfs_update_folio(struct file *file, struct folio *folio, -- cgit v1.2.3 From 401df0d4f4098ecc9c5278da2f50756d62e5b37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 19 Dec 2023 13:01:03 +0100 Subject: nvmem: layouts: refactor .add_cells() callback arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simply pass whole "struct nvmem_layout" instead of single variables. There is nothing in "struct nvmem_layout" that we have to hide from layout drivers. They also access it during .probe() and .remove(). Thanks to this change: 1. API gets more consistent All layouts drivers callbacks get the same argument 2. Layouts get correct device Before this change NVMEM core code was passing NVMEM device instead of layout device. That resulted in: * Confusing prints * Calling devm_*() helpers on wrong device * Helpers like of_device_get_match_data() dereferencing NULLs 3. It gets possible to get match data First of all nvmem_layout_get_match_data() requires passing "struct nvmem_layout" which .add_cells() callback didn't have before this. It doesn't matter much as it's rather useless now anyway (and will be dropped). 
What's more important however is that of_device_get_match_data() can be used now thanks to owning a proper device pointer. Signed-off-by: Rafał Miłecki Reviewed-by: Miquel Raynal Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20231219120104.3422-1-zajec5@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 6fe65b35ea972..81a67642ac553 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -173,7 +173,7 @@ struct nvmem_cell_table { struct nvmem_layout { struct device dev; struct nvmem_device *nvmem; - int (*add_cells)(struct device *dev, struct nvmem_device *nvmem); + int (*add_cells)(struct nvmem_layout *layout); }; struct nvmem_layout_driver { -- cgit v1.2.3 From 43f60e3fb62edc7bd8891de8779fb422f4ae23ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 19 Dec 2023 13:01:04 +0100 Subject: nvmem: drop nvmem_layout_get_match_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks to the layouts refactoring, we now have a "struct device" associated with the layout. Also its OF pointer points directly to the "nvmem-layout" DT node. All it takes to get match data is a generic of_device_get_match_data(). 
Signed-off-by: Rafał Miłecki Reviewed-by: Miquel Raynal Reviewed-by: Michael Walle Link: https://lore.kernel.org/r/20231219120104.3422-2-zajec5@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-provider.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 81a67642ac553..f0ba0e03218f9 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -205,9 +205,6 @@ void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv); module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \ nvmem_layout_driver_unregister) -const void *nvmem_layout_get_match_data(struct nvmem_device *nvmem, - struct nvmem_layout *layout); - #else static inline struct nvmem_device *nvmem_register(const struct nvmem_config *c) @@ -238,13 +235,6 @@ static inline int nvmem_layout_register(struct nvmem_layout *layout) static inline void nvmem_layout_unregister(struct nvmem_layout *layout) {} -static inline const void * -nvmem_layout_get_match_data(struct nvmem_device *nvmem, - struct nvmem_layout *layout) -{ - return NULL; -} - #endif /* CONFIG_NVMEM */ #if IS_ENABLED(CONFIG_NVMEM) && IS_ENABLED(CONFIG_OF) -- cgit v1.2.3 From 33cf42e68efc8ff529a7eee08a4f0ba8c8d0a207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 21 Dec 2023 18:34:17 +0100 Subject: nvmem: core: add nvmem_dev_size() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is required by layouts that need to read whole NVMEM content. It's especially useful for NVMEM devices without hardcoded layout (like U-Boot environment data block). 
Signed-off-by: Rafał Miłecki Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20231221173421.13737-2-zajec5@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/nvmem-consumer.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nvmem-consumer.h b/include/linux/nvmem-consumer.h index 2d306fa13b1a8..34c0e58dfa266 100644 --- a/include/linux/nvmem-consumer.h +++ b/include/linux/nvmem-consumer.h @@ -81,6 +81,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem, struct nvmem_cell_info *info, void *buf); const char *nvmem_dev_name(struct nvmem_device *nvmem); +size_t nvmem_dev_size(struct nvmem_device *nvmem); void nvmem_add_cell_lookups(struct nvmem_cell_lookup *entries, size_t nentries); -- cgit v1.2.3 From aeda33ab8160c7a2e24ba4f44492ad1e974ddc7d Mon Sep 17 00:00:00 2001 From: Abhijit Gangurde Date: Fri, 22 Dec 2023 12:16:26 +0530 Subject: cdx: create sysfs bin files for cdx resources Resource binary file contains the content of the memory regions. These resources devices can be used to mmap the MMIO regions in the user-space. 
Co-developed-by: Puneet Gupta Signed-off-by: Puneet Gupta Signed-off-by: Abhijit Gangurde Link: https://lore.kernel.org/r/20231222064627.2828960-1-abhijit.gangurde@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/cdx/cdx_bus.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 94ad2c9017c9d..fab9e62c7e7d3 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -135,6 +135,7 @@ struct cdx_device { u8 bus_num; u8 dev_num; struct resource res[MAX_CDX_DEV_RESOURCES]; + struct bin_attribute *res_attr[MAX_CDX_DEV_RESOURCES]; u8 res_count; u64 dma_mask; u16 flags; @@ -147,6 +148,15 @@ struct cdx_device { #define to_cdx_device(_dev) \ container_of(_dev, struct cdx_device, dev) +#define cdx_resource_start(dev, num) ((dev)->res[(num)].start) +#define cdx_resource_end(dev, num) ((dev)->res[(num)].end) +#define cdx_resource_flags(dev, num) ((dev)->res[(num)].flags) +#define cdx_resource_len(dev, num) \ + ((cdx_resource_start((dev), (num)) == 0 && \ + cdx_resource_end((dev), (num)) == \ + cdx_resource_start((dev), (num))) ? 0 : \ + (cdx_resource_end((dev), (num)) - \ + cdx_resource_start((dev), (num)) + 1)) /** * struct cdx_driver - CDX device driver * @driver: Generic device driver -- cgit v1.2.3 From cf60af04edfe51fca488246c9959904adb2750fa Mon Sep 17 00:00:00 2001 From: Abhijit Gangurde Date: Fri, 22 Dec 2023 12:16:27 +0530 Subject: cdx: Create resource debugfs file for cdx device The resource debugfs file contains host addresses of CDX device resources. Each line of the resource file describes the type of resource, a region with start-end and flag fields. 
Signed-off-by: Abhijit Gangurde Link: https://lore.kernel.org/r/20231222064627.2828960-2-abhijit.gangurde@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/cdx/cdx_bus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index fab9e62c7e7d3..6355a36a3f815 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -113,6 +113,7 @@ struct cdx_controller { * @dev_num: Device number for this device * @res: array of MMIO region entries * @res_attr: resource binary attribute + * @debugfs_dir: debugfs directory for this device * @res_count: number of valid MMIO regions * @dma_mask: Default DMA mask * @flags: CDX device flags @@ -136,6 +137,7 @@ struct cdx_device { u8 dev_num; struct resource res[MAX_CDX_DEV_RESOURCES]; struct bin_attribute *res_attr[MAX_CDX_DEV_RESOURCES]; + struct dentry *debugfs_dir; u8 res_count; u64 dma_mask; u16 flags; -- cgit v1.2.3 From a87e55bfa25c195b3aaa25369175905ba9527fff Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 15:26:38 +0100 Subject: moxtet: remove unused moxtet_type declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some reason, moxtet_type was defined in moxtet.h, but never actually used. Looks like a left-over from the original commit that was exporting the moxtet bus type, but that wasn't needed, and it was a different variable name, so no one noticed this one dangling around. 
Cc: Marek Behún Link: https://lore.kernel.org/r/2023121937-pants-heroics-17c1@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/moxtet.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moxtet.h b/include/linux/moxtet.h index 79184948fab47..ac577699edfdb 100644 --- a/include/linux/moxtet.h +++ b/include/linux/moxtet.h @@ -35,8 +35,6 @@ enum turris_mox_module_id { #define MOXTET_NIRQS 16 -extern struct bus_type moxtet_type; - struct moxtet { struct device *dev; struct mutex lock; -- cgit v1.2.3 From e1be24b2e1190a7662462e8e398189ac795339cd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 18:18:58 +0100 Subject: platform/surface: aggregator: make ssam_bus_type constant and static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the driver core can properly handle constant struct bus_type, move the ssam_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. It's also never used outside of drivers/platform/surface/aggregator/bus.c so make it static and don't export it as no one is using it. 
Cc: Maximilian Luz Cc: Hans de Goede Cc: Ilpo Järvinen Cc: Reviewed-by: Maximilian Luz Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/2023121957-tapered-upswing-8326@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/surface_aggregator/device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/surface_aggregator/device.h b/include/linux/surface_aggregator/device.h index 42b249b4c24b1..8cd8c38cf3f30 100644 --- a/include/linux/surface_aggregator/device.h +++ b/include/linux/surface_aggregator/device.h @@ -193,7 +193,6 @@ struct ssam_device_driver { #ifdef CONFIG_SURFACE_AGGREGATOR_BUS -extern struct bus_type ssam_bus_type; extern const struct device_type ssam_device_type; /** -- cgit v1.2.3 From 0c4b2255b7afbcc80f4efcc8f67425162f49c263 Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Mon, 18 Dec 2023 21:50:24 -0800 Subject: firmware: xilinx: Export function to use in other module Export zynqmp_pm_get_family_info() to access and find family information in other module. 
Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20231219055025.27570-2-jay.buddhabhatti@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 1478f691cc10e..06f4e6eaf13e5 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -524,6 +524,7 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); +int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); @@ -602,6 +603,11 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) return -ENODEV; } +static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +{ + return -ENODEV; +} + static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out) { -- cgit v1.2.3 From 97d62760e441af9ed393e127a46172f9534b5808 Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Mon, 18 Dec 2023 21:50:25 -0800 Subject: drivers: soc: xilinx: add check for platform Some error event IDs for Versal and Versal NET are different. Both the platforms should access their respective error event IDs so use sub_family_code to check for platform and check error IDs for respective platforms. The family code is passed via platform data to avoid platform detection again. Platform data is set up when the event driver is registered. 
Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20231219055025.27570-3-jay.buddhabhatti@amd.com Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/xlnx-zynqmp.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 06f4e6eaf13e5..9a7e527392512 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -95,10 +95,18 @@ /* * Node IDs for the Error Events. */ -#define EVENT_ERROR_PMC_ERR1 (0x28100000U) -#define EVENT_ERROR_PMC_ERR2 (0x28104000U) -#define EVENT_ERROR_PSM_ERR1 (0x28108000U) -#define EVENT_ERROR_PSM_ERR2 (0x2810C000U) +#define VERSAL_EVENT_ERROR_PMC_ERR1 (0x28100000U) +#define VERSAL_EVENT_ERROR_PMC_ERR2 (0x28104000U) +#define VERSAL_EVENT_ERROR_PSM_ERR1 (0x28108000U) +#define VERSAL_EVENT_ERROR_PSM_ERR2 (0x2810C000U) + +#define VERSAL_NET_EVENT_ERROR_PMC_ERR1 (0x28100000U) +#define VERSAL_NET_EVENT_ERROR_PMC_ERR2 (0x28104000U) +#define VERSAL_NET_EVENT_ERROR_PMC_ERR3 (0x28108000U) +#define VERSAL_NET_EVENT_ERROR_PSM_ERR1 (0x2810C000U) +#define VERSAL_NET_EVENT_ERROR_PSM_ERR2 (0x28110000U) +#define VERSAL_NET_EVENT_ERROR_PSM_ERR3 (0x28114000U) +#define VERSAL_NET_EVENT_ERROR_PSM_ERR4 (0x28118000U) /* ZynqMP SD tap delay tuning */ #define SD_ITAPDLY 0xFF180314 -- cgit v1.2.3 From 98e20e5e13d2811898921f999288be7151a11954 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Tue, 26 Dec 2023 14:07:42 +0100 Subject: bpfilter: remove bpfilter bpfilter was supposed to convert iptables filtering rules into BPF programs on the fly, from the kernel, through a usermode helper. The base code for the UMH was introduced in 2018, and couple of attempts (2, 3) tried to introduce the BPF program generate features but were abandoned. bpfilter now sits in a kernel tree unused and unusable, occasionally causing confusion amongst Linux users (4, 5). 
As bpfilter is now developed in a dedicated repository on GitHub (6), it was suggested a couple of times this year (LSFMM/BPF 2023, LPC 2023) to remove the deprecated kernel part of the project. This is the purpose of this patch. [1]: https://lore.kernel.org/lkml/20180522022230.2492505-1-ast@kernel.org/ [2]: https://lore.kernel.org/bpf/20210829183608.2297877-1-me@ubique.spb.ru/#t [3]: https://lore.kernel.org/lkml/20221224000402.476079-1-qde@naccy.de/ [4]: https://dxuuu.xyz/bpfilter.html [5]: https://github.com/linuxkit/linuxkit/pull/3904 [6]: https://github.com/facebook/bpfilter Signed-off-by: Quentin Deslandes Link: https://lore.kernel.org/r/20231226130745.465988-1-qde@naccy.de Signed-off-by: Alexei Starovoitov --- include/linux/bpfilter.h | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 include/linux/bpfilter.h (limited to 'include/linux') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h deleted file mode 100644 index 736ded4905e09..0000000000000 --- a/include/linux/bpfilter.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BPFILTER_H -#define _LINUX_BPFILTER_H - -#include -#include -#include - -struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen); - -struct bpfilter_umh_ops { - struct umd_info info; - /* since ip_getsockopt() can run in parallel, serialize access to umh */ - struct mutex lock; - int (*sockopt)(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set); - int (*start)(void); -}; -extern struct bpfilter_umh_ops bpfilter_ops; -#endif -- cgit v1.2.3 From 57331a59ac0d680f606403eb24edd3c35aecba31 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Thu, 4 Jan 2024 09:58:46 -0500 Subject: NFSv4.1: Use the nfs_client's rpc timeouts for backchannel For backchannel requests that 
lookup the appropriate nfs_client, use the state-management rpc_clnt's rpc_timeout parameters for the backchannel's response. When the nfs_client cannot be found, fall back to using the xprt's default timeout parameters. Signed-off-by: Benjamin Coddington Tested-by: Chuck Lever Tested-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/bc_xprt.h | 3 ++- include/linux/sunrpc/sched.h | 14 +++++++++++++- include/linux/sunrpc/svc.h | 2 ++ include/linux/sunrpc/xprt.h | 11 ----------- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index db30a159f9d5f..f22bf915dcf6e 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -20,7 +20,8 @@ #ifdef CONFIG_SUNRPC_BACKCHANNEL struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid); void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied); -void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task); +void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, + const struct rpc_timeout *to); void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 8ada7dc802d30..2d61987b35456 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -37,6 +37,17 @@ struct rpc_wait { struct list_head timer_list; /* Timer list */ }; +/* + * This describes a timeout strategy + */ +struct rpc_timeout { + unsigned long to_initval, /* initial timeout */ + to_maxval, /* max timeout */ + to_increment; /* if !exponential */ + unsigned int to_retries; /* max # of retries */ + unsigned char to_exponential; +}; + /* * This is the RPC task struct */ @@ -205,7 +216,8 @@ struct rpc_wait_queue { */ struct rpc_task 
*rpc_new_task(const struct rpc_task_setup *); struct rpc_task *rpc_run_task(const struct rpc_task_setup *); -struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req); +struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req, + struct rpc_timeout *timeout); void rpc_put_task(struct rpc_task *); void rpc_put_task_async(struct rpc_task *); bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b10f987509cc8..3331a1c2b47e6 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -250,6 +250,8 @@ struct svc_rqst { struct net *rq_bc_net; /* pointer to backchannel's * net namespace */ + unsigned long bc_to_initval; + unsigned int bc_to_retries; void ** rq_lease_breaker; /* The v4 client breaking a lease */ unsigned int rq_status_counter; /* RPC processing counter */ }; diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f85d3a0daca26..464f6a9492ab7 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -30,17 +30,6 @@ #define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) -/* - * This describes a timeout strategy - */ -struct rpc_timeout { - unsigned long to_initval, /* initial timeout */ - to_maxval, /* max timeout */ - to_increment; /* if !exponential */ - unsigned int to_retries; /* max # of retries */ - unsigned char to_exponential; -}; - enum rpc_display_format_t { RPC_DISPLAY_ADDR = 0, RPC_DISPLAY_PORT, -- cgit v1.2.3 From fe1eb24bd5ade085914248c527044e942f75e06a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Jan 2024 16:04:35 -0800 Subject: Revert "Introduce PHY listing and link_topology tracking" This reverts commit 32bb4515e34469975abc936deb0a116c4a445817. This reverts commit d078d480639a4f3b5fc2d56247afa38e0956483a. This reverts commit fcc4b105caa4b844bf043375bf799c20a9c99db1. 
This reverts commit 345237dbc1bdbb274c9fb9ec38976261ff4a40b8. This reverts commit 7db69ec9cfb8b4ab50420262631fb2d1908b25bf. This reverts commit 95132a018f00f5dad38bdcfd4180d1af955d46f6. This reverts commit 63d5eaf35ac36cad00cfb3809d794ef0078c822b. This reverts commit c29451aefcb42359905d18678de38e52eccb3bb5. This reverts commit 2ab0edb505faa9ac90dee1732571390f074e8113. This reverts commit dedd702a35793ab462fce4c737eeba0badf9718e. This reverts commit 034fcc210349b873ece7356905be5c6ca11eef2a. This reverts commit 9c5625f559ad6fe9f6f733c11475bf470e637d34. This reverts commit 02018c544ef113e980a2349eba89003d6f399d22. Looks like we need more time for reviews, and incremental changes will be hard to make sense of. So revert. Link: https://lore.kernel.org/all/ZZP6FV5sXEf+xd58@shell.armlinux.org.uk/ Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 +- include/linux/phy.h | 6 --- include/linux/phy_link_topology.h | 67 ---------------------------------- include/linux/phy_link_topology_core.h | 19 ---------- include/linux/sfp.h | 8 +--- 5 files changed, 2 insertions(+), 102 deletions(-) delete mode 100644 include/linux/phy_link_topology.h delete mode 100644 include/linux/phy_link_topology_core.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e265aa1f21699..118c40258d07b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,6 +40,7 @@ #include #endif #include + #include #include #include @@ -51,7 +52,6 @@ #include #include #include -#include struct netpoll_info; struct device; @@ -2047,7 +2047,6 @@ enum netdev_stat_type { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one - * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. 
@@ -2442,7 +2441,6 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif - struct phy_link_topology link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index 6cb9d843aee93..e9e85d3475872 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -544,9 +544,6 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. - * @phyindex: Unique id across the phy's parent tree of phys to address the PHY - * from userspace, similar to ifindex. A zero index means the PHY - * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -646,7 +643,6 @@ struct phy_device { struct device_link *devlink; - u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; @@ -1726,8 +1722,6 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); -int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); -void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h deleted file mode 100644 index 91902263ec0ef..0000000000000 --- a/include/linux/phy_link_topology.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PHY device list allow maintaining a list of PHY devices that are - * part of a netdevice's link topology. 
PHYs can for example be chained, - * as is the case when using a PHY that exposes an SFP module, on which an - * SFP transceiver that embeds a PHY is connected. - * - * This list can then be used by userspace to leverage individual PHY - * capabilities. - */ -#ifndef __PHY_LINK_TOPOLOGY_H -#define __PHY_LINK_TOPOLOGY_H - -#include -#include - -struct xarray; -struct phy_device; -struct net_device; -struct sfp_bus; - -struct phy_device_node { - enum phy_upstream upstream_type; - - union { - struct net_device *netdev; - struct phy_device *phydev; - } upstream; - - struct sfp_bus *parent_sfp_bus; - - struct phy_device *phy; -}; - -static inline struct phy_device * -phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex) -{ - struct phy_device_node *pdn = xa_load(&topo->phys, phyindex); - - if (pdn) - return pdn->phy; - - return NULL; -} - -#if IS_ENABLED(CONFIG_PHYLIB) -int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream); - -void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy); - -#else -static inline int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream) -{ - return 0; -} - -static inline void phy_link_topo_del_phy(struct phy_link_topology *topo, - struct phy_device *phy) -{ -} -#endif - -#endif /* __PHY_LINK_TOPOLOGY_H */ diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h deleted file mode 100644 index 78c75f9094897..0000000000000 --- a/include/linux/phy_link_topology_core.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PHY_LINK_TOPOLOGY_CORE_H -#define __PHY_LINK_TOPOLOGY_CORE_H - -struct xarray; - -struct phy_link_topology { - struct xarray phys; - - u32 next_phy_index; -}; - -static inline void phy_link_topo_init(struct phy_link_topology *topo) -{ - xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1); - 
topo->next_phy_index = 1; -} - -#endif /* __PHY_LINK_TOPOLOGY_CORE_H */ diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 55c0ab17c9e2e..9346cd44814d6 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -544,7 +544,7 @@ struct sfp_upstream_ops { void (*link_down)(void *priv); void (*link_up)(void *priv); int (*connect_phy)(void *priv, struct phy_device *); - void (*disconnect_phy)(void *priv, struct phy_device *); + void (*disconnect_phy)(void *priv); }; #if IS_ENABLED(CONFIG_SFP) @@ -570,7 +570,6 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode); int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops); void sfp_bus_del_upstream(struct sfp_bus *bus); -const char *sfp_get_name(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -649,11 +648,6 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } - -static inline const char *sfp_get_name(struct sfp_bus *bus) -{ - return NULL; -} #endif #endif -- cgit v1.2.3 From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:44 +0100 Subject: bpf: Relax tracing prog recursive attach rules Currently, it's not allowed to attach an fentry/fexit prog to another one fentry/fexit. At the same time it's not uncommon to see a tracing program with lots of logic in use, and the attachment limitation prevents usage of fentry/fexit for performance analysis (e.g. with "bpftool prog profile" command) in this case. An example could be falcosecurity libs project that uses tp_btf tracing programs. Following the corresponding discussion [1], the reason for that is to avoid tracing progs call cycles without introducing more complex solutions. 
But currently it seems impossible to load and attach tracing programs in a way that will form such a cycle. The limitation is coming from the fact that attach_prog_fd is specified at the prog load (thus making it impossible to attach to a program loaded after it in this way), as well as tracing progs not implementing link_detach. Replace "no same type" requirement with verification that no more than one level of attachment nesting is allowed. In this way only one fentry/fexit program could be attached to another fentry/fexit to cover profiling use case, and still no cycle could be formed. To implement, add a new field into bpf_prog_aux to track nested attachment for tracing programs. [1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/ Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7671530d6e4e0..e30100597d0a9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1449,6 +1449,7 @@ struct bpf_prog_aux { bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; -- cgit v1.2.3 From 990b6b5b13b7993b7f44740c0add3119d407ccbf Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:20 +0800 Subject: jbd2: add errseq to detect client fs's bdev writeback error Add errseq in journal, so that JBD2 can detect whether metadata is successfully written to fs bdev. 
This patch adds detection in the recovery process to replace the original solution (using the local variable wb_err). Signed-off-by: Zhihao Cheng Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-2-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index beb30719ee161..cea1aa70ae36f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -998,6 +998,13 @@ struct journal_s */ struct block_device *j_fs_dev; + /** + * @j_fs_dev_wb_err: + * + * Records the errseq of the client fs's backing block device. + */ + errseq_t j_fs_dev_wb_err; + /** * @j_total_len: Total maximum capacity of the journal region on disk. */ @@ -1698,6 +1705,25 @@ static inline void jbd2_journal_abort_handle(handle_t *handle) handle->h_aborted = 1; } +static inline void jbd2_init_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; + + /* + * Save the original wb_err value of client fs's bdev mapping which + * could be used to detect the client fs's metadata async write error.
+ */ + errseq_check_and_advance(&mapping->wb_err, &journal->j_fs_dev_wb_err); +} + +static inline int jbd2_check_fs_dev_write_error(journal_t *journal) +{ + struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping; + + return errseq_check(&mapping->wb_err, + READ_ONCE(journal->j_fs_dev_wb_err)); +} + #endif /* __KERNEL__ */ /* Comparison functions for transaction IDs: perform comparisons using -- cgit v1.2.3 From 8a4fd33d879fb303b207f06ea6340d73f698c4ed Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 13 Dec 2023 09:32:22 +0800 Subject: jbd2: remove unused 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags' Since 'JBD2_CHECKPOINT_IO_ERROR' and 'j_atomic_flags' are not useful anymore after fs dev's errseq is imported into jbd2, just remove them. Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231213013224.2100050-4-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index cea1aa70ae36f..971f3e826e152 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -755,11 +755,6 @@ struct journal_s */ unsigned long j_flags; - /** - * @j_atomic_flags: Atomic journaling state flags.
- */ - unsigned long j_atomic_flags; - /** * @j_errno: * @@ -1406,12 +1401,6 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_JOURNAL_FLUSH_VALID (JBD2_JOURNAL_FLUSH_DISCARD | \ JBD2_JOURNAL_FLUSH_ZEROOUT) -/* - * Journal atomic flag definitions - */ -#define JBD2_CHECKPOINT_IO_ERROR 0x001 /* Detect io error while writing - * buffer back to disk */ - /* * Function declarations for the journaling transaction and buffer * management -- cgit v1.2.3 From c2e64baac4f36f7e0365218c7523bb9ba4639250 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 21 Dec 2023 11:08:02 +0100 Subject: pwm: Add pwm_apply_state() compatibility stub MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to make the transition to the new pwm_apply_might_sleep() a bit smoother, add a compatibility stub. This will prevent new calls to the old function introduced via other subsystems from breaking builds. Once the next merge window has closed we can take another stab at removing the stub. 
Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 5dd665d8c909e..2a5e1154652e8 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -537,6 +537,13 @@ static inline void pwm_apply_args(struct pwm_device *pwm) pwm_apply_might_sleep(pwm, &state); } +/* only for backwards-compatibility, new code should not use this */ +static inline int pwm_apply_state(struct pwm_device *pwm, + const struct pwm_state *state) +{ + return pwm_apply_might_sleep(pwm, state); +} + struct pwm_lookup { struct list_head list; const char *provider; -- cgit v1.2.3 From d73f444d06fb8a42a5c0623453f3ea1fe9880229 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:06:20 -0800 Subject: pwm: linux/pwm.h: fix Excess kernel-doc description warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the @pwm: line to prevent the kernel-doc warning: include/linux/pwm.h:87: warning: Excess struct member 'pwm' description in 'pwm_device' Signed-off-by: Randy Dunlap Cc: Thierry Reding Cc: Uwe Kleine-König Cc: Fixes: f3e25e68ceb2 ("pwm: Drop unused member "pwm" from struct pwm_device") Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- include/linux/pwm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 2a5e1154652e8..fcc2c4496f731 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -69,7 +69,6 @@ struct pwm_state { * @label: name of the PWM device * @flags: flags associated with the PWM device * @hwpwm: per-chip relative index of the PWM device - * @pwm: global index of the PWM device * @chip: PWM chip providing this PWM device * @args: PWM arguments * @state: last applied state -- cgit v1.2.3 From 92a714d727ec9e7ccfcc7432d348aba730145914 Mon Sep 17 00:00:00 2001 From: David 
Howells Date: Thu, 4 Jan 2024 15:52:11 +0000 Subject: netfs: Fix interaction between write-streaming and cachefiles culling An issue can occur between write-streaming (storing dirty data in partial non-uptodate pages) and a cachefiles object being culled to make space. The problem occurs because the cache object is only marked in use while there are files open using it. Once it has been released, it can be culled and the cookie marked disabled. At this point, a streaming write is permitted to occur (if the cache is active, we require pages to be prefetched and cached), but the cache can become active again before this gets flushed out - and then two effects can occur: (1) The cache may be asked to write out a region that's less than its DIO block size (assumed by cachefiles to be PAGE_SIZE) - and this causes one of two debugging statements to be emitted. (2) netfs_how_to_modify() gets confused because it sees a page that isn't allowed to be non-uptodate being uptodate and tries to prefetch it - leading to a warning that PG_fscache is set twice. Fix this by the following means: (1) Add a netfs_inode flag to disallow write-streaming to an inode and set it if we ever do local caching of that inode. It remains set for the lifetime of that inode - even if the cookie becomes disabled. (2) If the no-write-streaming flag is set, then make netfs_how_to_modify() always want to prefetch instead. (3) If netfs_how_to_modify() decides it wants to prefetch a folio, but that folio has write-streamed data in it, then it requires the folio be flushed first. (4) Export a counter of the number of times we wanted to prefetch a non-uptodate page, but found it had write-streamed data in it. (5) Export a counter of the number of times we cancelled a write to the cache because it didn't DIO align and remove the debug statements. 
Reported-by: Marc Dionne Signed-off-by: David Howells cc: Jeff Layton cc: linux-cachefs@redhat.com cc: linux-erofs@lists.ozlabs.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org --- include/linux/fscache-cache.h | 3 +++ include/linux/netfs.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index a174cedf4d907..bdf7f3eddf0a2 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -189,17 +189,20 @@ extern atomic_t fscache_n_write; extern atomic_t fscache_n_no_write_space; extern atomic_t fscache_n_no_create_space; extern atomic_t fscache_n_culled; +extern atomic_t fscache_n_dio_misfit; #define fscache_count_read() atomic_inc(&fscache_n_read) #define fscache_count_write() atomic_inc(&fscache_n_write) #define fscache_count_no_write_space() atomic_inc(&fscache_n_no_write_space) #define fscache_count_no_create_space() atomic_inc(&fscache_n_no_create_space) #define fscache_count_culled() atomic_inc(&fscache_n_culled) +#define fscache_count_dio_misfit() atomic_inc(&fscache_n_dio_misfit) #else #define fscache_count_read() do {} while(0) #define fscache_count_write() do {} while(0) #define fscache_count_no_write_space() do {} while(0) #define fscache_count_no_create_space() do {} while(0) #define fscache_count_culled() do {} while(0) +#define fscache_count_dio_misfit() do {} while(0) #endif #endif /* _LINUX_FSCACHE_CACHE_H */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index d3bac60fcd6f3..100cbb261269d 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -142,6 +142,7 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ +#define NETFS_ICTX_NO_WRITE_STREAMING 3 /* Don't engage in write-streaming */ }; /* -- cgit v1.2.3 From 
8a6286c1804e2c7144aef3154a0357c4b496e10b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Jan 2024 14:28:36 +0100 Subject: dpll: expose fractional frequency offset value to user Add a new netlink attribute to expose fractional frequency offset value for a pin. Add an op to get the value from the driver. Signed-off-by: Jiri Pirko Acked-by: Vadim Fedorenko Acked-by: Arkadiusz Kubalewski Link: https://lore.kernel.org/r/20240103132838.1501801-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b1a5f9ca8ee5d..9cf896ea1d412 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -77,6 +77,9 @@ struct dpll_pin_ops { const struct dpll_device *dpll, void *dpll_priv, const s32 phase_adjust, struct netlink_ext_ack *extack); + int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv, + const struct dpll_device *dpll, void *dpll_priv, + s64 *ffo, struct netlink_ext_ack *extack); }; struct dpll_pin_frequency { -- cgit v1.2.3 From 9c5938694cd0e9e00bdfb7e60900673263daf4d5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 5 Jan 2024 16:57:29 +0100 Subject: mm/rmap: silence VM_WARN_ON_FOLIO() in __folio_rmap_sanity_checks() Unfortunately, vm_insert_page() and friends end up passing driver-allocated folios into folio_add_file_rmap_pte() using insert_page_into_pte_locked(). While these driver-allocated folios can be compound pages (large folios), they are not proper "rmappable" folios. In these VM_MIXEDMAP VMAs, there isn't really the concept of a reverse mapping, so long-term, we should clean that up and not call into rmap code. For the time being, document how we can end up in rmap code with large folios that are not marked rmappable.
Link: https://lkml.kernel.org/r/793c5cee-d5fc-4eb1-86a2-39e05686233d@redhat.com Fixes: 68f0320824fa ("mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()") Reported-by: syzbot+50ef73537bbc393a25bb@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/000000000000014174060e09316e@google.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/rmap.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index fd6fe16fa3583..b7944a833668a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -199,8 +199,15 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); - VM_WARN_ON_FOLIO(folio_test_large(folio) && - !folio_test_large_rmappable(folio), folio); + + /* + * TODO: we get driver-allocated folios that have nothing to do with + * the rmap using vm_insert_page(); therefore, we cannot assume that + * folio_test_large_rmappable() holds for large folios. We should + * handle any desired mapcount+stats accounting for these folios in + * VM_MIXEDMAP VMAs separately, and then sanity-check here that + * we really only get rmappable folios. + */ VM_WARN_ON_ONCE(nr_pages <= 0); VM_WARN_ON_FOLIO(page_folio(page) != folio, folio); -- cgit v1.2.3 From 71ce1ab54a505736786d9c5921e6c2718c7ec535 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:01 +0000 Subject: mm/mglru: add CONFIG_ARCH_HAS_HW_PTE_YOUNG Patch series "mm/mglru: Kconfig cleanup", v4. This series is the result of the following discussion: https://lore.kernel.org/47066176-bd93-55dd-c2fa-002299d9e034@linux.ibm.com/ It mainly avoids building the code that walks page tables on CPUs that cannot use it, i.e., those that don't support the hardware accessed bit.
Specifically, it introduces a new Kconfig to guard some of functions added by commit bd74fdaea146 ("mm: multi-gen LRU: support page table walks") on CPUs like POWER9, on which the series was tested. This patch (of 5): Some architectures are able to set the accessed bit in PTEs when PTEs are used as part of linear address translations. Add CONFIG_ARCH_HAS_HW_PTE_YOUNG for such architectures to be able to override arch_has_hw_pte_young(). Link: https://lkml.kernel.org/r/20231227141205.2200125-1-kinseyho@google.com Link: https://lkml.kernel.org/r/20231227141205.2200125-2-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index af7639c3b0a3a..9ecc20fa62696 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -375,7 +375,7 @@ static inline bool arch_has_hw_nonleaf_pmd_young(void) */ static inline bool arch_has_hw_pte_young(void) { - return false; + return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG); } #endif -- cgit v1.2.3 From 61dd3f246b3adaabff3241c586f2210ac91b05a4 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:02 +0000 Subject: mm/mglru: add CONFIG_LRU_GEN_WALKS_MMU Add CONFIG_LRU_GEN_WALKS_MMU such that if disabled, the code that walks page tables to promote pages into the youngest generation will not be built. Also improves code readability by adding two helper functions get_mm_state() and get_next_mm(). 
Link: https://lkml.kernel.org/r/20231227141205.2200125-3-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- include/linux/mm_types.h | 12 ++++++++---- include/linux/mmzone.h | 2 ++ 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5de775e6cdd91..20ff87f8e001d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -330,7 +330,7 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* per-memcg mm_struct list */ struct lru_gen_mm_list mm_list; #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a66534c78c4dd..552fa2d11c57c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -958,7 +958,7 @@ struct mm_struct { */ unsigned long ksm_zero_pages; #endif /* CONFIG_KSM */ -#ifdef CONFIG_LRU_GEN +#ifdef CONFIG_LRU_GEN_WALKS_MMU struct { /* this mm_struct is on lru_gen_mm_list */ struct list_head list; @@ -973,7 +973,7 @@ struct mm_struct { struct mem_cgroup *memcg; #endif } lru_gen; -#endif /* CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ } __randomize_layout; /* @@ -1011,6 +1011,10 @@ struct lru_gen_mm_list { spinlock_t lock; }; +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_LRU_GEN_WALKS_MMU + void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); #ifdef CONFIG_MEMCG @@ -1036,7 +1040,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) WRITE_ONCE(mm->lru_gen.bitmap, -1); } -#else /* !CONFIG_LRU_GEN */ +#else /* !CONFIG_LRU_GEN_WALKS_MMU */ static inline void lru_gen_add_mm(struct mm_struct *mm) { @@ -1060,7 +1064,7 @@ static inline void lru_gen_use_mm(struct mm_struct *mm) { } -#endif /* 
CONFIG_LRU_GEN */ +#endif /* CONFIG_LRU_GEN_WALKS_MMU */ struct vma_iterator { struct ma_state mas; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2efd3be484fdd..bc3f63ec42914 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -640,9 +640,11 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_folio lrugen; +#ifdef CONFIG_LRU_GEN_WALKS_MMU /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif +#endif /* CONFIG_LRU_GEN */ #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif -- cgit v1.2.3 From 745b13e647cd119e70d16b57698e12b7c86ca264 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:03 +0000 Subject: mm/mglru: remove CONFIG_MEMCG Remove CONFIG_MEMCG in a refactoring to improve code readability at the cost of a few bytes in struct lru_gen_folio per node when CONFIG_MEMCG=n. Link: https://lkml.kernel.org/r/20231227141205.2200125-4-kinseyho@google.com Signed-off-by: Kinsey Ho Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Tested-by: Donet Tom Acked-by: Yu Zhao Cc: kernel test robot Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ---- include/linux/mmzone.h | 26 ++------------------------ 2 files changed, 2 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 552fa2d11c57c..55b7121809ff9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1017,9 +1017,7 @@ struct lru_gen_mm_list { void lru_gen_add_mm(struct mm_struct *mm); void lru_gen_del_mm(struct mm_struct *mm); -#ifdef CONFIG_MEMCG void lru_gen_migrate_mm(struct mm_struct *mm); -#endif static inline void lru_gen_init_mm(struct mm_struct *mm) { @@ -1050,11 +1048,9 @@ static inline void lru_gen_del_mm(struct mm_struct *mm) { } -#ifdef CONFIG_MEMCG static inline void lru_gen_migrate_mm(struct mm_struct *mm) { } -#endif static inline void 
lru_gen_init_mm(struct mm_struct *mm) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bc3f63ec42914..28665e1b84754 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -440,14 +440,12 @@ struct lru_gen_folio { atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multi-gen LRU is enabled */ bool enabled; -#ifdef CONFIG_MEMCG /* the memcg generation this lru_gen_folio belongs to */ u8 gen; /* the list segment this lru_gen_folio belongs to */ u8 seg; /* per-node lru_gen_folio list for global reclaim */ struct hlist_nulls_node list; -#endif }; enum { @@ -493,11 +491,6 @@ struct lru_gen_mm_walk { bool force_scan; }; -void lru_gen_init_lruvec(struct lruvec *lruvec); -void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); - -#ifdef CONFIG_MEMCG - /* * For each node, memcgs are divided into two generations: the old and the * young. For each generation, memcgs are randomly sharded into multiple bins @@ -555,6 +548,8 @@ struct lru_gen_memcg { }; void lru_gen_init_pgdat(struct pglist_data *pgdat); +void lru_gen_init_lruvec(struct lruvec *lruvec); +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw); void lru_gen_init_memcg(struct mem_cgroup *memcg); void lru_gen_exit_memcg(struct mem_cgroup *memcg); @@ -563,19 +558,6 @@ void lru_gen_offline_memcg(struct mem_cgroup *memcg); void lru_gen_release_memcg(struct mem_cgroup *memcg); void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid); -#else /* !CONFIG_MEMCG */ - -#define MEMCG_NR_GENS 1 - -struct lru_gen_memcg { -}; - -static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) -{ -} - -#endif /* CONFIG_MEMCG */ - #else /* !CONFIG_LRU_GEN */ static inline void lru_gen_init_pgdat(struct pglist_data *pgdat) @@ -590,8 +572,6 @@ static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { } -#ifdef CONFIG_MEMCG - static inline void lru_gen_init_memcg(struct mem_cgroup *memcg) { } @@ -616,8 +596,6 @@ static inline void 
lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid) { } -#endif /* CONFIG_MEMCG */ - #endif /* CONFIG_LRU_GEN */ struct lruvec { -- cgit v1.2.3 From 533c67e6358406727145efae32882c4dc355d6c5 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Wed, 27 Dec 2023 14:12:04 +0000 Subject: mm/mglru: add dummy pmd_dirty() Add dummy pmd_dirty() for architectures that don't provide it. This is similar to commit 6617da8fb565 ("mm: add dummy pmd_young() for architectures not having it"). Link: https://lkml.kernel.org/r/20231227141205.2200125-5-kinseyho@google.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312210606.1Etqz3M4-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202312210042.xQEiqlEh-lkp@intel.com/ Signed-off-by: Kinsey Ho Suggested-by: Yu Zhao Cc: Aneesh Kumar K.V Cc: Donet Tom Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9ecc20fa62696..466cf477551a8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -184,6 +184,13 @@ static inline int pmd_young(pmd_t pmd) } #endif +#ifndef pmd_dirty +static inline int pmd_dirty(pmd_t pmd) +{ + return 0; +} +#endif + /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode -- cgit v1.2.3 From e435ca87882167dda78776ce4bd6eb2094eb864b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Dec 2023 08:57:43 +0000 Subject: mm: remove inc/dec lruvec page state functions Patch series "Remove some lruvec page accounting functions", v2. Some functions are now unused; remove them. Make __mod_lruvec_page_state() unused and then remove it. This patch (of 6): All callers of these have been converted to their folio equivalents. 
Link: https://lkml.kernel.org/r/20231228085748.1083901-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231228085748.1083901-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Johannes Weiner Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index fed855bae6d8e..147ae73e0ee7b 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -597,18 +597,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, 1); -} - -static inline void __dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - __mod_lruvec_page_state(page, idx, -1); -} - static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -627,18 +615,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio, __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -static inline void inc_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, 1); -} - -static inline void dec_lruvec_page_state(struct page *page, - enum node_stat_item idx) -{ - mod_lruvec_page_state(page, idx, -1); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { -- cgit v1.2.3 From c701123bd68bf1cc3bc167b4f597cb1f4995c39c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 28 Dec 2023 08:57:48 +0000 Subject: mm/memcontrol: remove __mod_lruvec_page_state() There are no more callers of __mod_lruvec_page_state(), so convert the implementation to __lruvec_stat_mod_folio(), removing two calls to compound_head() (one explicit, one hidden inside page_memcg()). 
Link: https://lkml.kernel.org/r/20231228085748.1083901-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 147ae73e0ee7b..343906a98d6ee 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -556,19 +556,25 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } -void __mod_lruvec_page_state(struct page *page, +void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void mod_lruvec_page_state(struct page *page, +static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_page_state(page, idx, val); + __lruvec_stat_mod_folio(folio, idx, val); local_irq_restore(flags); } +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + lruvec_stat_mod_folio(page_folio(page), idx, val); +} + #else static inline void __mod_lruvec_state(struct lruvec *lruvec, @@ -583,10 +589,16 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __mod_lruvec_page_state(struct page *page, - enum node_stat_item idx, int val) +static inline void __lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) { - __mod_node_page_state(page_pgdat(page), idx, val); + __mod_node_page_state(folio_pgdat(folio), idx, val); +} + +static inline void lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + mod_node_page_state(folio_pgdat(folio), idx, val); } static inline 
void mod_lruvec_page_state(struct page *page, @@ -597,12 +609,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - __mod_lruvec_page_state(&folio->page, idx, val); -} - static inline void __lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { @@ -615,12 +621,6 @@ static inline void __lruvec_stat_sub_folio(struct folio *folio, __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); } -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_lruvec_page_state(&folio->page, idx, val); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { -- cgit v1.2.3 From b805ab3c6935d14654ccc28f16ffce7a13c2c528 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 29 Dec 2023 10:26:51 +0800 Subject: mm/vmstat: move pgdemote_* out of CONFIG_NUMA_BALANCING Demotion can work well without CONFIG_NUMA_BALANCING. But the commit 23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats") wrongly hid it behind CONFIG_NUMA_BALANCING. Fix it by moving them out of CONFIG_NUMA_BALANCING. Link: https://lkml.kernel.org/r/20231229022651.3229174-1-lizhijian@fujitsu.com Fixes: 23e9f0138963 ("mm/vmstat: move pgdemote_* to per-node stats") Signed-off-by: Li Zhijian Cc: "Huang, Ying" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 28665e1b84754..c18c53353b50f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -207,11 +207,11 @@ enum node_stat_item { #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ PGPROMOTE_CANDIDATE, /* candidate pages to promote */ +#endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, -#endif NR_VM_NODE_STAT_ITEMS }; -- cgit v1.2.3 From 26a1a86dd093a10d0653429bf013dae6e95dccbf Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:29 -0800 Subject: cxl/events: Promote CXL event structures to a core header UEFI code can process CXL events through CPER records. Those records use almost the same format as the CXL events. Lift the CXL event structures to a core header to be shared in later patches. [jic123: drop "CXL rev 3.0" mention] Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-2-1bb8a4ca2c7a@intel.com [djbw: add F: entry to maintainers for include/linux/cxl-event.h] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/cxl-event.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 include/linux/cxl-event.h (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h new file mode 100644 index 0000000000000..0fc068123f8ed --- /dev/null +++ b/include/linux/cxl-event.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Intel Corporation. 
*/ +#ifndef _LINUX_CXL_EVENT_H +#define _LINUX_CXL_EVENT_H + +/* + * Common Event Record Format + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 + */ +struct cxl_event_record_hdr { + uuid_t id; + u8 length; + u8 flags[3]; + __le16 handle; + __le16 related_handle; + __le64 timestamp; + u8 maint_op_class; + u8 reserved[15]; +} __packed; + +#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 +struct cxl_event_record_raw { + struct cxl_event_record_hdr hdr; + u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; +} __packed; + +/* + * General Media Event Record + * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 + */ +#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 +struct cxl_event_gen_media { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + u8 validity_flags[2]; + u8 channel; + u8 rank; + u8 device[3]; + u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; + u8 reserved[46]; +} __packed; + +/* + * DRAM Event Record - DER + * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44 + */ +#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 +struct cxl_event_dram { + struct cxl_event_record_hdr hdr; + __le64 phys_addr; + u8 descriptor; + u8 type; + u8 transaction_type; + u8 validity_flags[2]; + u8 channel; + u8 rank; + u8 nibble_mask[3]; + u8 bank_group; + u8 bank; + u8 row[3]; + u8 column[2]; + u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE]; + u8 reserved[0x17]; +} __packed; + +/* + * Get Health Info Record + * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100 + */ +struct cxl_get_health_info { + u8 health_status; + u8 media_status; + u8 add_status; + u8 life_used; + u8 device_temp[2]; + u8 dirty_shutdown_cnt[4]; + u8 cor_vol_err_cnt[4]; + u8 cor_per_err_cnt[4]; +} __packed; + +/* + * Memory Module Event Record + * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 + */ +struct cxl_event_mem_module { + struct cxl_event_record_hdr hdr; + u8 event_type; + struct cxl_get_health_info info; + u8 reserved[0x3d]; +} __packed; + +#endif /* _LINUX_CXL_EVENT_H */ -- cgit v1.2.3
From 477bd4beb93bf9ace9bda71f1437b191befa9cf4 Mon Sep 17 00:00:00 2001 From: Swee Leong Ching Date: Fri, 5 Jan 2024 15:09:23 +0800 Subject: net: stmmac: Make MSI interrupt routine generic There is no support for per-DMA-channel interrupts on non-MSI platforms, where the MAC's per-channel interrupt hooks up to the interrupt controller (GIC) through a shared peripheral interrupt (SPI) to handle interrupts from the TX/RX channels. This patch generalizes the existing MSI ISR to also support non-MSI platforms. Signed-off-by: Teoh Ji Sheng Signed-off-by: Swee Leong Ching Signed-off-by: David S. Miller --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index dee5ad6e48c5a..b950e6f9761de 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -98,7 +98,7 @@ struct stmmac_dma_cfg { int mixed_burst; bool aal; bool eame; - bool multi_msi_en; + bool multi_irq_en; bool dche; }; @@ -215,7 +215,7 @@ struct dwmac4_addrs { #define STMMAC_FLAG_TSO_EN BIT(4) #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP BIT(5) #define STMMAC_FLAG_VLAN_FAIL_Q_EN BIT(6) -#define STMMAC_FLAG_MULTI_MSI_EN BIT(7) +#define STMMAC_FLAG_MULTI_IRQ_EN BIT(7) #define STMMAC_FLAG_EXT_SNAPSHOT_EN BIT(8) #define STMMAC_FLAG_INT_SNAPSHOT_EN BIT(9) #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(10) -- cgit v1.2.3 From deb704281f076097b0347116a82edeba96697db1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Nov 2023 17:14:27 -0500 Subject: SUNRPC: Add a server-side API for retrieving an RPC's pseudoflavor NFSD will use this new API to determine whether nfsd_splice_read is safe to use. This avoids the need to add a dependency to NFSD for CONFIG_SUNRPC_GSS.
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcauth.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 6f90203edbf8d..61c455f1e1f50 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -131,8 +131,11 @@ enum svc_auth_status { * This call releases a domain. * * set_client() - * Givens a pending request (struct svc_rqst), finds and assigns + * Given a pending request (struct svc_rqst), finds and assigns * an appropriate 'auth_domain' as the client. + * + * pseudoflavor() + * Returns RPC_AUTH pseudoflavor in use by @rqstp. */ struct auth_ops { char * name; @@ -143,11 +146,13 @@ struct auth_ops { int (*release)(struct svc_rqst *rqstp); void (*domain_release)(struct auth_domain *dom); enum svc_auth_status (*set_client)(struct svc_rqst *rqstp); + rpc_authflavor_t (*pseudoflavor)(struct svc_rqst *rqstp); }; struct svc_xprt; extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp); +extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); -- cgit v1.2.3 From 3587b5c75376fd0b6ca8c4a8de54954e410f4e0e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 17 Nov 2023 17:14:46 -0500 Subject: SUNRPC: Remove RQ_SPLICE_OK This flag is no longer used. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b10f987509cc8..544fcfe074799 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -260,8 +260,6 @@ enum { RQ_LOCAL, /* local request */ RQ_USEDEFERRAL, /* use deferral */ RQ_DROPME, /* drop current reply */ - RQ_SPLICE_OK, /* turned off in gss privacy to prevent - * encrypting page cache pages */ RQ_VICTIM, /* Have agreed to shut down */ RQ_DATA, /* request has data */ }; -- cgit v1.2.3 From 52e89100754b2e888cb63bf2d19e65d809497cd6 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Sat, 2 Dec 2023 21:07:25 +0000 Subject: NFSv4, NFSD: move enum nfs_cb_opnum4 to include/linux/nfs4.h Callback operations enum is defined in client and server, move it to common header file. Signed-off-by: ChenXiaoSong Acked-by: Anna Schumaker Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index c11c4db346393..ef8d2d618d5b3 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -869,4 +869,26 @@ enum { RCA4_TYPE_MASK_OTHER_LAYOUT_MAX = 15, }; +enum nfs_cb_opnum4 { + OP_CB_GETATTR = 3, + OP_CB_RECALL = 4, + + /* Callback operations new to NFSv4.1 */ + OP_CB_LAYOUTRECALL = 5, + OP_CB_NOTIFY = 6, + OP_CB_PUSH_DELEG = 7, + OP_CB_RECALL_ANY = 8, + OP_CB_RECALLABLE_OBJ_AVAIL = 9, + OP_CB_RECALL_SLOT = 10, + OP_CB_SEQUENCE = 11, + OP_CB_WANTS_CANCELLED = 12, + OP_CB_NOTIFY_LOCK = 13, + OP_CB_NOTIFY_DEVICEID = 14, + + /* Callback operations new to NFSv4.2 */ + OP_CB_OFFLOAD = 15, + + OP_CB_ILLEGAL = 10044, +}; + #endif -- cgit v1.2.3 From b541dd554bc0442f7ff8c6cab6c5460c044913c8 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:13 -0500 Subject: 
svcrdma: Eliminate allocation of recv_ctxt objects in backchannel The svc_rdma_recv_ctxt free list uses a lockless list to avoid the need for a spin lock in the fast path. llist_del_first(), which is used by svc_rdma_recv_ctxt_get(), requires serialization, however, when there are multiple list producers that are unserialized. I mistakenly thought there was only one caller of svc_rdma_recv_ctxt_get() (svc_rdma_refresh_recvs()), thus explicit serialization would not be necessary. But there is another caller: svc_rdma_bc_sendto(), and these two are not serialized against each other. I haven't seen ill effects that I could directly ascribe to a lack of serialization. It's just an observation based on code audit. When DMA-mapping before sending a Reply, the passed-in struct svc_rdma_recv_ctxt is used only for its write and reply PCLs. These are currently always empty in the backchannel case. So, instead of passing a full svc_rdma_recv_ctxt object to svc_rdma_map_reply_msg(), let's pass in just the Write and Reply PCLs. This change makes it unnecessary for the backchannel to acquire a dummy svc_rdma_recv_ctxt object when sending an RPC Call. The need for svc_rdma_recv_ctxt free list serialization is now completely avoided. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a5ee0af2a3103..4ac32895a0580 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -200,7 +200,8 @@ extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - const struct svc_rdma_recv_ctxt *rctxt, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, const struct xdr_buf *xdr); extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, -- cgit v1.2.3 From 9c7e1a06588ee6962afe0dfe5a398e1d23212005 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:26 -0500 Subject: svcrdma: Add a utility workqueue to svcrdma To handle work in the background, set up an UNBOUND workqueue for svcrdma. Subsequent patches will make use of it. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 4ac32895a0580..e18c94e816b3b 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -65,6 +65,7 @@ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; +extern struct workqueue_struct *svcrdma_wq; extern struct percpu_counter svcrdma_stat_read; extern struct percpu_counter svcrdma_stat_recv; -- cgit v1.2.3 From ae225fe27b931de89b6b1e1bbe6de4de23000850 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Nov 2023 11:40:33 -0500 Subject: svcrdma: Add an async version of svc_rdma_send_ctxt_put() DMA unmapping can take quite some time, so it should not be handled in a single-threaded completion handler. Defer releasing send_ctxts to the recently-added workqueue. With this patch, DMA unmapping can be handled in parallel, and it does not cause head-of-queue blocking of Send completions. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e18c94e816b3b..ab250017b99f3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -152,7 +152,9 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_send_ctxt { struct llist_node sc_node; struct rpc_rdma_cid sc_cid; + struct work_struct sc_work; + struct svcxprt_rdma *sc_rdma; struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; -- cgit v1.2.3 From 6a04a4349330c5476adf465159a7f49411091bbe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:09 -0500 Subject: svcrdma: Move struct svc_rdma_chunk_ctxt to svc_rdma.h Prepare for nestling these into the send and recv ctxts so they no longer have to be allocated dynamically. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index ab250017b99f3..50c4f18a9b7fe 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -127,6 +127,21 @@ enum { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* + * A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a set of rdma_rw's that handle data movement + * for all segments of one chunk. 
+ */ +struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; + struct ib_cqe cc_cqe; + struct list_head cc_rwctxts; + ktime_t cc_posttime; + int cc_sqecount; + enum ib_wc_status cc_status; + struct completion cc_done; +}; + struct svc_rdma_recv_ctxt { struct llist_node rc_node; struct list_head rc_list; -- cgit v1.2.3 From b1818412d06fc03605d02dbdd4a7c53dc9e2d5ba Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:16 -0500 Subject: svcrdma: Start moving fields out of struct svc_rdma_read_info Since the request's svc_rdma_recv_ctxt will stay around for the duration of the RDMA Read operation, the contents of struct svc_rdma_read_info can reside in the request's svc_rdma_recv_ctxt rather than being allocated separately. This will eventually save a call to kmalloc() in a hot path. Start this clean-up by moving the Read chunk's svc_rdma_chunk_ctxt. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 50c4f18a9b7fe..6c7501ae4e293 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -156,6 +156,10 @@ struct svc_rdma_recv_ctxt { u32 rc_inv_rkey; __be32 rc_msgtype; + /* State for pulling a Read chunk */ + unsigned int rc_readbytes; + struct svc_rdma_chunk_ctxt rc_cc; + struct svc_rdma_pcl rc_call_pcl; struct svc_rdma_pcl rc_read_pcl; -- cgit v1.2.3 From 8e122582680c6f8acd686a5a2af9c0e46fe90f2d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:22 -0500 Subject: svcrdma: Move svc_rdma_read_info::ri_pageno to struct svc_rdma_recv_ctxt Further clean up: move the page index field into svc_rdma_recv_ctxt. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 6c7501ae4e293..0ea66f73bec22 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt { __be32 rc_msgtype; /* State for pulling a Read chunk */ + unsigned int rc_curpage; unsigned int rc_readbytes; struct svc_rdma_chunk_ctxt rc_cc; -- cgit v1.2.3 From 919f6e790ab6cca772fa60c6006162c0a7ebbfc5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:28 -0500 Subject: svcrdma: Move read_info::ri_pageoff into struct svc_rdma_recv_ctxt Further clean up: move the starting byte offset field into svc_rdma_recv_ctxt. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 0ea66f73bec22..44a14eaf8c40a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -157,6 +157,7 @@ struct svc_rdma_recv_ctxt { __be32 rc_msgtype; /* State for pulling a Read chunk */ + unsigned int rc_pageoff; unsigned int rc_curpage; unsigned int rc_readbytes; struct svc_rdma_chunk_ctxt rc_cc; -- cgit v1.2.3 From fc20f19b4df4a46d1003d15d84148a117e8bdf5d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:57:35 -0500 Subject: svcrdma: Update synopsis of svc_rdma_build_read_segment() Since the RDMA Read I/O state is now contained in the recv_ctxt, svc_rdma_build_read_segment() can use the recv_ctxt to derive that information rather than the other way around. This removes one usage of the ri_readctxt field, enabling its removal in a subsequent patch. At the same time, the use of ri_rqst can similarly be replaced with a passed-in function parameter. 
Start with build_read_segment() because it is a common utility function at the bottom of the Read chunk path. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 44a14eaf8c40a..f03f9909fb976 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -116,6 +116,13 @@ struct svcxprt_rdma { /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 +static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + return container_of(xprt, struct svcxprt_rdma, sc_xprt); +} + /* * Default connection parameters */ -- cgit v1.2.3 From 018f34051bc9f4908336b3fe9e52931bb8410ced Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 4 Dec 2023 09:58:33 -0500 Subject: svcrdma: Move the svc_rdma_cc_init() call Now that the chunk_ctxt for Reads is no longer dynamically allocated it can be initialized once for the life of the object that contains it (struct svc_rdma_recv_ctxt). 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f03f9909fb976..051fefde8d518 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -211,6 +211,8 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); -- cgit v1.2.3 From 28ee0ec8948ac235327a1f5472fc032b308284a3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 11 Dec 2023 10:24:08 -0500 Subject: svcrdma: De-duplicate completion ID initialization helpers Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 051fefde8d518..46f2ce9f810b6 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -134,6 +134,30 @@ enum { #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/** + * svc_rdma_recv_cid_init - Initialize a Receive Queue completion ID + * @rdma: controlling transport + * @cid: completion ID to initialize + */ +static inline void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_rq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + +/** + * svc_rdma_send_cid_init - Initialize a Send Queue completion ID + * @rdma: controlling transport + * @cid: completion ID to initialize + */ +static inline void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = 
rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + /* * A chunk context tracks all I/O for moving one Read or Write * chunk. This is a set of rdma_rw's that handle data movement -- cgit v1.2.3 From 4d9d69db898d05bd063548eee65d16a020676fec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:31:48 -0500 Subject: svcrdma: Add back svc_rdma_recv_ctxt::rc_pages Having an nfsd thread waiting for an RDMA Read completion is problematic if the Read responder (the client) stops responding. We need to go back to handling RDMA Reads by allowing the nfsd thread to return to the svc scheduler, then waking a second thread to finish the RPC message once the Read completion fires. To start with, restore the rc_pages field so that RDMA Read pages can be managed across calls to svc_rdma_recvfrom(). Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 46f2ce9f810b6..0f2d7f68ef5d8 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -183,7 +183,6 @@ struct svc_rdma_recv_ctxt { void *rc_recv_buf; struct xdr_stream rc_stream; u32 rc_byte_len; - unsigned int rc_page_count; u32 rc_inv_rkey; __be32 rc_msgtype; @@ -199,6 +198,9 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_chunk *rc_cur_result_payload; struct svc_rdma_pcl rc_write_pcl; struct svc_rdma_pcl rc_reply_pcl; + + unsigned int rc_page_count; + struct page *rc_pages[RPCSVC_MAXPAGES]; }; struct svc_rdma_send_ctxt { -- cgit v1.2.3 From a937693a82fd2211c5e52b638959d1486a77d16a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:31:54 -0500 Subject: svcrdma: Add back svcxprt_rdma::sc_read_complete_q Having an nfsd thread waiting for an RDMA Read completion is problematic if the Read responder (ie, the client) stops responding. 
We need to go back to handling RDMA Reads by allowing the nfsd thread to return to the svc scheduler, then waking a second thread to finish the RPC message once the Read completion fires. As a next step, add a list_head upon which completed Reads are queued. A subsequent patch will make use of this queue. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 0f2d7f68ef5d8..c98d29e51b9cb 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -98,6 +98,7 @@ struct svcxprt_rdma { u32 sc_pending_recvs; u32 sc_recv_batch; struct list_head sc_rq_dto_q; + struct list_head sc_read_complete_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; struct ib_cq *sc_rq_cq; -- cgit v1.2.3 From d3dba534100d4e9eb7a5204be97cd6f9ada2066e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Dec 2023 17:32:07 -0500 Subject: svcrdma: Implement multi-stage Read completion again Having an nfsd thread waiting for an RDMA Read completion is problematic if the Read responder (ie, the client) stops responding. We need to go back to handling RDMA Reads by getting the svc scheduler to call svc_rdma_recvfrom() a second time to finish building an RPC message after a Read completion. This is the final patch, and makes several changes that have to happen concurrently: 1. svc_rdma_process_read_list no longer waits for a completion, but simply builds and posts the Read WRs. 2. svc_rdma_read_done() now queues a completed Read on sc_read_complete_q for later processing rather than calling complete(). 3. The completed RPC message is no longer built in the svc_rdma_process_read_list() path. Finishing the message is now done in svc_rdma_recvfrom() when it notices work on the sc_read_complete_q. The "finish building this RPC message" code is removed from the svc_rdma_process_read_list() path. 
This arrangement avoids the need for an nfsd thread to wait for an RDMA Read non-interruptibly without a timeout. It's basically the same code structure that Tom Tucker used for Read chunks along with some clean-up and modernization. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c98d29e51b9cb..e7595ae62fe29 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -170,8 +170,6 @@ struct svc_rdma_chunk_ctxt { struct list_head cc_rwctxts; ktime_t cc_posttime; int cc_sqecount; - enum ib_wc_status cc_status; - struct completion cc_done; }; struct svc_rdma_recv_ctxt { @@ -191,6 +189,7 @@ struct svc_rdma_recv_ctxt { unsigned int rc_pageoff; unsigned int rc_curpage; unsigned int rc_readbytes; + struct xdr_buf rc_saved_arg; struct svc_rdma_chunk_ctxt rc_cc; struct svc_rdma_pcl rc_call_pcl; @@ -240,6 +239,9 @@ extern int svc_rdma_recvfrom(struct svc_rqst *); extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc); +extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir); extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk, const struct xdr_buf *xdr); -- cgit v1.2.3 From 7b207ccd983350a5dedd132b57c666186dd02a7c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 11:56:32 +1100 Subject: svc: don't hold reference for poolstats, only mutex. A future patch will remove refcounting on svc_serv as it is of little use. It is currently used to keep the svc around while the pool_stats file is open. Change this to get the pointer, protected by the mutex, only in seq_start, and then release the mutex in seq_stop. 
This means that if the nfsd server is stopped and restarted while the pool_stats file is open, then some pool stats info could be from the first instance and some from the second. This might appear odd, but is unlikely to be a problem in practice. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 544fcfe074799..3bea2840272d3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -97,6 +97,12 @@ struct svc_serv { #endif /* CONFIG_SUNRPC_BACKCHANNEL */ }; +/* This is used by pool_stats to find and lock an svc */ +struct svc_info { + struct svc_serv *serv; + struct mutex *mutex; +}; + /** * svc_get() - increment reference count on a SUNRPC serv * @serv: the svc_serv to have count incremented @@ -431,7 +437,7 @@ void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); -int svc_pool_stats_open(struct svc_serv *serv, struct file *file); +int svc_pool_stats_open(struct svc_info *si, struct file *file); void svc_process(struct svc_rqst *rqstp); void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp); int svc_register(const struct svc_serv *, struct net *, const int, -- cgit v1.2.3 From 1e3577a4521ef33199eea05ce7b9099825848c49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 15 Dec 2023 11:56:34 +1100 Subject: SUNRPC: discard sv_refcnt, and svc_get/svc_put sv_refcnt is no longer useful. lockd and nfs-cb only ever have the svc active when there are a non-zero number of threads, so sv_refcnt mirrors sv_nrthreads. nfsd also keeps the svc active between when a socket is added and when the first thread is started, but we don't really need a refcount for that. 
We can simply not destroy the svc while there are any permanent sockets attached. So remove sv_refcnt and the get/put functions. Instead of a final call to svc_put(), call svc_destroy() instead. This is changed to also store NULL in the passed-in pointer to make it easier to avoid use-after-free situations. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 3bea2840272d3..8d7888234e9e4 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -69,7 +69,6 @@ struct svc_serv { struct svc_program * sv_program; /* RPC program */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; - struct kref sv_refcnt; unsigned int sv_nrthreads; /* # of server threads */ unsigned int sv_maxconn; /* max connections allowed or * '0' causing max to be based @@ -103,31 +102,7 @@ struct svc_info { struct mutex *mutex; }; -/** - * svc_get() - increment reference count on a SUNRPC serv - * @serv: the svc_serv to have count incremented - * - * Returns: the svc_serv that was passed in. - */ -static inline struct svc_serv *svc_get(struct svc_serv *serv) -{ - kref_get(&serv->sv_refcnt); - return serv; -} - -void svc_destroy(struct kref *); - -/** - * svc_put - decrement reference count on a SUNRPC serv - * @serv: the svc_serv to have count decremented - * - * When the reference count reaches zero, svc_destroy() - * is called to clean up and free the serv. - */ -static inline void svc_put(struct svc_serv *serv) -{ - kref_put(&serv->sv_refcnt, svc_destroy); -} +void svc_destroy(struct svc_serv **svcp); /* * Maximum payload size supported by a kernel RPC server. 
-- cgit v1.2.3 From 317bacf960a4879af22d12175f47d284930b3273 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 1 Dec 2023 17:25:27 -0500 Subject: i3c: master: add enable(disable) hot join in sys entry Add hotjoin entry in sys file system allow user enable/disable hotjoin feature. Add (*enable(disable)_hotjoin)() to i3c_master_controller_ops. Add api i3c_master_enable(disable)_hotjoin(); Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20231201222532.2431484-2-Frank.Li@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 24c1863b86e2b..3b5bd8e3257c1 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -460,6 +460,8 @@ struct i3c_master_controller_ops { int (*disable_ibi)(struct i3c_dev_desc *dev); void (*recycle_ibi_slot)(struct i3c_dev_desc *dev, struct i3c_ibi_slot *slot); + int (*enable_hotjoin)(struct i3c_master_controller *master); + int (*disable_hotjoin)(struct i3c_master_controller *master); }; /** @@ -495,6 +497,7 @@ struct i3c_master_controller { const struct i3c_master_controller_ops *ops; unsigned int secondary : 1; unsigned int init_done : 1; + unsigned int hotjoin: 1; struct { struct list_head i3c; struct list_head i2c; @@ -551,6 +554,8 @@ int i3c_master_register(struct i3c_master_controller *master, const struct i3c_master_controller_ops *ops, bool secondary); void i3c_master_unregister(struct i3c_master_controller *master); +int i3c_master_enable_hotjoin(struct i3c_master_controller *master); +int i3c_master_disable_hotjoin(struct i3c_master_controller *master); /** * i3c_dev_get_master_data() - get master private data attached to an I3C -- cgit v1.2.3 From e5e3df06ac98d15cfb10bb5c12356709365e91b2 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Fri, 1 Dec 2023 17:25:29 -0500 Subject: i3c: add actual_len in i3c_priv_xfer MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In MIPI I3C Specification: "Ninth Bit of SDR Target Returned (Read) Data as End-of-Data: In I2C, the ninth Data bit from Target to Controller is an ACK by the Controller. By contrast, in I3C this bit allows the Target to end a Read, and allows the Controller to Abort a Read. In SDR terms, the ninth bit of Read data is referred to as the T-Bit (for ‘Transition’)" I3C allows devices to terminate a data transfer early, so an "actual_len" field is needed to indicate how much data an i3c_priv_xfer actually transferred. Reviewed-by: Miquel Raynal Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20231201222532.2431484-4-Frank.Li@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 84ed77c049400..e119f11948efe 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -54,6 +54,7 @@ enum i3c_hdr_mode { * struct i3c_priv_xfer - I3C SDR private transfer * @rnw: encodes the transfer direction. true for a read, false for a write * @len: transfer length in bytes of the transfer + * @actual_len: actual length in bytes are transferred by the controller * @data: input/output buffer * @data.in: input buffer. Must point to a DMA-able buffer * @data.out: output buffer. 
Must point to a DMA-able buffer @@ -62,6 +63,7 @@ enum i3c_hdr_mode { struct i3c_priv_xfer { u8 rnw; u16 len; + u16 actual_len; union { void *in; const void *out; -- cgit v1.2.3 From 18e5794879905a788e06fb2bc40b6f5b58eae5c2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Dec 2023 21:05:42 -0800 Subject: i3c: master: fix Excess kernel-doc description warning Remove the @boardinfo: line to prevent the kernel-doc warning: include/linux/i3c/master.h:98: warning: Excess struct member 'boardinfo' description in 'i2c_dev_desc' Signed-off-by: Randy Dunlap Cc: Alexandre Belloni Cc: Link: https://lore.kernel.org/r/20231223050542.13930-1-rdunlap@infradead.org Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 3b5bd8e3257c1..1ecd73b17ff5d 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -76,7 +76,6 @@ struct i2c_dev_boardinfo { /** * struct i2c_dev_desc - I2C device descriptor * @common: common part of the I2C device descriptor - * @boardinfo: pointer to the boardinfo attached to this I2C device * @dev: I2C device object registered to the I2C framework * @addr: I2C device address * @lvr: LVR (Legacy Virtual Register) needed by the I3C core to know about -- cgit v1.2.3 From 10416a3578ba5f76d0b161d2d36a1d8a4c46a69d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 15:59:32 +0100 Subject: firewire: make fw_bus_type const Now that the driver core can properly handle constant struct bus_type, move the fw_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Cc: Takashi Sakamoto Cc: linux1394-devel@lists.sourceforge.net Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/2023121931-skydiver-dodgy-d1bd@gregkh Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index bd3fc75d4f146..dd9f2d765e68b 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -75,7 +75,7 @@ void fw_csr_iterator_init(struct fw_csr_iterator *ci, const u32 *p); int fw_csr_iterator_next(struct fw_csr_iterator *ci, int *key, int *value); int fw_csr_string(const u32 *directory, int key, char *buf, size_t size); -extern struct bus_type fw_bus_type; +extern const struct bus_type fw_bus_type; struct fw_card_driver; struct fw_node; -- cgit v1.2.3 From e9ee910218ffd420454b52a052d6f1087354905b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 7 Jan 2024 17:11:38 -0800 Subject: Revert "net: stmmac: Enable Per DMA Channel interrupt" Revert "net: stmmac: Use interrupt mode INTM=1 for per channel irq" This reverts commit 36af9f25ddfd311da82628f194c794786467cb12. Revert "net: stmmac: Add support for TX/RX channel interrupt" This reverts commit 9072e03d32088137a435ddf3aa95fd6e038d69d8. Revert "net: stmmac: Make MSI interrupt routine generic" This reverts commit 477bd4beb93bf9ace9bda71f1437b191befa9cf4. Revert "dt-bindings: net: snps,dwmac: per channel irq" This reverts commit 67d47c8ada0f8795bfcdb85cc8f2ad3ce556674b. Device tree bindings need to be reviewed. 
Link: https://lore.kernel.org/all/2df9fe3e-7971-4aa2-89a9-0e085b3b00d7@linaro.org/ Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b950e6f9761de..dee5ad6e48c5a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -98,7 +98,7 @@ struct stmmac_dma_cfg { int mixed_burst; bool aal; bool eame; - bool multi_irq_en; + bool multi_msi_en; bool dche; }; @@ -215,7 +215,7 @@ struct dwmac4_addrs { #define STMMAC_FLAG_TSO_EN BIT(4) #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP BIT(5) #define STMMAC_FLAG_VLAN_FAIL_Q_EN BIT(6) -#define STMMAC_FLAG_MULTI_IRQ_EN BIT(7) +#define STMMAC_FLAG_MULTI_MSI_EN BIT(7) #define STMMAC_FLAG_EXT_SNAPSHOT_EN BIT(8) #define STMMAC_FLAG_INT_SNAPSHOT_EN BIT(9) #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(10) -- cgit v1.2.3 From 3fbf61207c66ff7ac9b60ab76d4bfd239f97e973 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 7 Jan 2024 17:14:51 -0800 Subject: Revert "mlx5 updates 2023-12-20" Revert "net/mlx5: Implement management PF Ethernet profile" This reverts commit 22c4640698a1d47606b5a4264a584e8046641784. Revert "net/mlx5: Enable SD feature" This reverts commit c88c49ac9c18fb7c3fa431126de1d8f8f555e912. Revert "net/mlx5e: Block TLS device offload on combined SD netdev" This reverts commit 83a59ce0057b7753d7fbece194b89622c663b2a6. Revert "net/mlx5e: Support per-mdev queue counter" This reverts commit d72baceb92539a178d2610b0e9ceb75706a75b55. Revert "net/mlx5e: Support cross-vhca RSS" This reverts commit c73a3ab8fa6e93a783bd563938d7cf00d62d5d34. Revert "net/mlx5e: Let channels be SD-aware" This reverts commit e4f9686bdee7b4dd89e0ed63cd03606e4bda4ced. Revert "net/mlx5e: Create EN core HW resources for all secondary devices" This reverts commit c4fb94aa822d6c9d05fc3c5aee35c7e339061dc1. 
Revert "net/mlx5e: Create single netdev per SD group" This reverts commit e2578b4f983cfcd47837bbe3bcdbf5920e50b2ad. Revert "net/mlx5: SD, Add informative prints in kernel log" This reverts commit c82d360325112ccc512fc11a3b68cdcdf04a1478. Revert "net/mlx5: SD, Implement steering for primary and secondaries" This reverts commit 605fcce33b2d1beb0139b6e5913fa0b2062116b2. Revert "net/mlx5: SD, Implement devcom communication and primary election" This reverts commit a45af9a96740873db9a4b5bb493ce2ad81ccb4d5. Revert "net/mlx5: SD, Implement basic query and instantiation" This reverts commit 63b9ce944c0e26c44c42cdd5095c2e9851c1a8ff. Revert "net/mlx5: SD, Introduce SD lib" This reverts commit 4a04a31f49320d078b8078e1da4b0e2faca5dfa3. Revert "net/mlx5: Fix query of sd_group field" This reverts commit e04984a37398b3f4f5a79c993b94c6b1224184cc. Revert "net/mlx5e: Use the correct lag ports number when creating TISes" This reverts commit a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703. There are some unanswered questions on the list, and we don't have any docs. Given the lack of replies so far and the fact that v6.8 merge window has started - let's revert this and revisit for v6.9. 
Link: https://lore.kernel.org/all/20231221005721.186607-1-saeed@kernel.org/ Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 10 ---------- include/linux/mlx5/mlx5_ifc.h | 24 ++++++------------------ include/linux/mlx5/vport.h | 1 - 3 files changed, 6 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2bba88c67f583..7ee5b79ff3d60 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,7 +681,6 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; - bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; @@ -822,7 +821,6 @@ struct mlx5_core_dev { struct blocking_notifier_head macsec_nh; #endif u64 num_ipsec_offloads; - struct mlx5_sd *sd; }; struct mlx5_db { @@ -1224,14 +1222,6 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) return dev->caps.embedded_cpu; } -static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev) -{ - if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid)) - return false; - - return MLX5_CAP_GEN_2(dev, local_mng_port); -} - static inline bool mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5865692092544..fee20fc010c2d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1954,10 +1954,8 @@ enum { struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; - u8 migratable[0x1]; - u8 reserved_at_81[0x19]; - u8 local_mng_port[0x1]; - u8 reserved_at_9b[0x5]; + u8 migratable[0x1]; + u8 reserved_at_81[0x1f]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -1975,13 +1973,7 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 allowed_object_for_other_vhca_access[0x40]; - u8 reserved_at_140[0x20]; - - u8 reserved_at_160[0xa]; - u8 
local_mng_port_valid[0x1]; - u8 reserved_at_16b[0x15]; - - u8 reserved_at_180[0x20]; + u8 reserved_at_140[0x60]; u8 flow_table_type_2_type[0x8]; u8 reserved_at_1a8[0x3]; @@ -4038,13 +4030,8 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xa0]; + u8 reserved_at_60[0xd0]; - u8 reserved_at_100[0x1]; - u8 sd_group[0x3]; - u8 reserved_at_104[0x1c]; - - u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10129,7 +10116,8 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x18]; + u8 reserved_at_28[0x15]; + u8 sd_group[0x3]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c36cc6d829267..fbb9bf4478894 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,7 +72,6 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); -int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit v1.2.3 From 4e33b071bb8e8415fb9847249ffcf300fa7d8cac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Dec 2023 07:51:41 +0000 Subject: block: remove disk_clear_zoned disk_clear_zoned is unused now that the last warts of the host-aware model support in sd are gone. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20231228075141.362560-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 94701a63ad8aa..e1e705aef51ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -318,7 +318,6 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); void disk_set_zoned(struct gendisk *disk); -void disk_clear_zoned(struct gendisk *disk); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, -- cgit v1.2.3 From 3b7cb745473aec7255d66e3854abaa9c3f46f952 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:50:16 -0700 Subject: block: move __get_task_ioprio() into header file We call this once per IO, which can be millions of times per second. Since nobody really uses io priorities, or at least it isn't very common, this is all wasted time and can amount to as much as 3% of the total kernel time. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 7578d4f6a969a..d6a9b5b7ed167 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -47,7 +47,30 @@ static inline int task_nice_ioclass(struct task_struct *task) } #ifdef CONFIG_BLOCK -int __get_task_ioprio(struct task_struct *p); +/* + * If the task has set an I/O priority, use that. Otherwise, return + * the default I/O priority. + * + * Expected to be called for current task or with task_lock() held to keep + * io_context stable. 
+ */ +static inline int __get_task_ioprio(struct task_struct *p) +{ + struct io_context *ioc = p->io_context; + int prio; + + if (p != current) + lockdep_assert_held(&p->alloc_lock); + if (ioc) + prio = ioc->ioprio; + else + prio = IOPRIO_DEFAULT; + + if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) + prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), + task_nice_ioprio(p)); + return prio; +} #else static inline int __get_task_ioprio(struct task_struct *p) { -- cgit v1.2.3 From 53889bcaf536b3abedeaf104019877cee37dd08b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:51:57 -0700 Subject: block: make __get_task_ioprio() easier to read We don't need to do any gymnastics if we don't have an io_context assigned at all, so just return early with our default priority. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index d6a9b5b7ed167..db1249cd96920 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -59,13 +59,13 @@ static inline int __get_task_ioprio(struct task_struct *p) struct io_context *ioc = p->io_context; int prio; + if (!ioc) + return IOPRIO_DEFAULT; + if (p != current) lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; + prio = ioc->ioprio; if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p)); -- cgit v1.2.3 From fd37721803c6e73619108f76ad2e12a9aa5fafaf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Dec 2023 17:47:03 +0300 Subject: mm, treewide: introduce NR_PAGE_ORDERS NR_PAGE_ORDERS defines the number of page orders supported by the page allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total. NR_PAGE_ORDERS assists in defining arrays of page orders and allows for more natural iteration over them. 
[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning] Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reviewed-by: Zi Yan Cc: Linus Torvalds Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c18c53353b50f..1ea7636dfb765 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,6 +35,8 @@ #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) +#define NR_PAGE_ORDERS (MAX_ORDER + 1) + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should @@ -96,7 +98,7 @@ static inline bool migratetype_is_mergeable(int mt) } #define for_each_migratetype_order(order, type) \ - for (order = 0; order <= MAX_ORDER; order++) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; @@ -933,7 +935,7 @@ struct zone { CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ - struct free_area free_area[MAX_ORDER + 1]; + struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_ORDER */ -- cgit v1.2.3 From 5e0a760b44417f7cadd79de2204d6247109558a0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 28 Dec 2023 17:47:04 +0300 Subject: mm, treewide: rename MAX_ORDER to MAX_PAGE_ORDER commit 23baf831a32c ("mm, treewide: redefine MAX_ORDER sanely") has changed the definition of MAX_ORDER to be inclusive. This has caused issues with code that was not yet upstream and depended on the previous definition. To draw attention to the altered meaning of the define, rename MAX_ORDER to MAX_PAGE_ORDER. 
Link: https://lkml.kernel.org/r/20231228144704.14033-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Linus Torvalds Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- include/linux/mmzone.h | 14 +++++++------- include/linux/pageblock-flags.h | 4 ++-- include/linux/slab.h | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 236ec7b63c541..c1ee640d87b11 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -829,7 +829,7 @@ static inline unsigned huge_page_shift(struct hstate *h) static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_ORDER; + return huge_page_order(h) > MAX_PAGE_ORDER; } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1ea7636dfb765..4ed33b1278215 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -27,15 +27,15 @@ /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_ARCH_FORCE_MAX_ORDER -#define MAX_ORDER 10 +#define MAX_PAGE_ORDER 10 #else -#define MAX_ORDER CONFIG_ARCH_FORCE_MAX_ORDER +#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER #endif -#define MAX_ORDER_NR_PAGES (1 << MAX_ORDER) +#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER) #define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES) -#define NR_PAGE_ORDERS (MAX_ORDER + 1) +#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed @@ -938,7 +938,7 @@ struct zone { struct free_area free_area[NR_PAGE_ORDERS]; #ifdef CONFIG_UNACCEPTED_MEMORY - /* Pages to be accepted. All pages on the list are MAX_ORDER */ + /* Pages to be accepted. 
All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; #endif @@ -1748,8 +1748,8 @@ static inline bool movable_only_nodes(nodemask_t *nodes) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) -#if (MAX_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS -#error Allocator MAX_ORDER exceeds SECTION_SIZE +#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS +#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e83c4c0950417..3f2409b968ec6 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -41,14 +41,14 @@ extern unsigned int pageblock_order; * Huge pages are a constant size, but don't exceed the maximum allocation * granularity. */ -#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER) +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ #else /* CONFIG_HUGETLB_PAGE */ /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ -#define pageblock_order MAX_ORDER +#define pageblock_order MAX_PAGE_ORDER #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/slab.h b/include/linux/slab.h index d6d6ffeeb9a2a..d63823e518c05 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -308,7 +308,7 @@ static inline unsigned int arch_slab_minalign(void) * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 
*/ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif @@ -316,7 +316,7 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLUB #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_PAGE_ORDER + PAGE_SHIFT) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -- cgit v1.2.3 From 2fb7e4dd35c52933b18ff127bf92d703c8e2e897 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Jan 2024 13:51:21 +0100 Subject: PNP: make pnp_bus_type const Now that the driver core can properly handle constant struct bus_type, move the pnp_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Signed-off-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- include/linux/pnp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 267fb8a4fb6e0..ddbe7c3ca4ce2 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -435,7 +435,7 @@ struct pnp_protocol { #define protocol_for_each_dev(protocol, dev) \ list_for_each_entry(dev, &(protocol)->devices, protocol_list) -extern struct bus_type pnp_bus_type; +extern const struct bus_type pnp_bus_type; #if defined(CONFIG_PNP) -- cgit v1.2.3 From 6eade110754c085cee9e46f4d87d2c3ea4e59e8c Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:32 -0800 Subject: cxl/events: Separate UUID from event structures The UEFI CXL CPER structure does not include the UUID. Now that the UUID is passed separately to the trace event there is no need to have the UUID in those structures. Move UUID from the event record header to the raw structures. Adjust cxl-test to Create dummy structures for creating test records. 
Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-5-1bb8a4ca2c7a@intel.com Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/cxl-event.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 0fc068123f8ed..3d9b5954d0c11 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -8,7 +8,6 @@ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 */ struct cxl_event_record_hdr { - uuid_t id; u8 length; u8 flags[3]; __le16 handle; @@ -18,8 +17,13 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; +/* + * Common Event Record Format + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 + */ #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 struct cxl_event_record_raw { + uuid_t id; struct cxl_event_record_hdr hdr; u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; } __packed; -- cgit v1.2.3 From f9c683386f5bc0364615138ce2b14be50848dbcf Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:33 -0800 Subject: cxl/events: Create a CXL event union The CXL CPER and event log records share everything but a UUID/GUID in their structures. Define a cxl_event union without the UUID/GUID to be shared between the CPER and event log record formats. Adjust the code to use this union. 
Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-6-1bb8a4ca2c7a@intel.com Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/cxl-event.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 3d9b5954d0c11..4d6c05f535f80 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -17,13 +17,8 @@ struct cxl_event_record_hdr { u8 reserved[15]; } __packed; -/* - * Common Event Record Format - * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 - */ #define CXL_EVENT_RECORD_DATA_LENGTH 0x50 -struct cxl_event_record_raw { - uuid_t id; +struct cxl_event_generic { struct cxl_event_record_hdr hdr; u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; } __packed; @@ -96,4 +91,20 @@ struct cxl_event_mem_module { u8 reserved[0x3d]; } __packed; +union cxl_event { + struct cxl_event_generic generic; + struct cxl_event_gen_media gen_media; + struct cxl_event_dram dram; + struct cxl_event_mem_module mem_module; +} __packed; + +/* + * Common Event Record Format; in event logs + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 + */ +struct cxl_event_record_raw { + uuid_t id; + union cxl_event event; +} __packed; + #endif /* _LINUX_CXL_EVENT_H */ -- cgit v1.2.3 From 671a794c33c6e048ca5cedd5ad6af44d52d5d7e5 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:34 -0800 Subject: acpi/ghes: Process CXL Component Events BIOS can configure memory devices as firmware first. This will send CXL events to the firmware instead of the OS. The firmware can then send these events to the OS via UEFI. UEFI v2.10 section N.2.14 defines a Common Platform Error Record (CPER) format for CXL Component Events. The format is mostly the same as the CXL Common Event Record Format. The difference is the use of a GUID in the Section Type rather than a UUID as part of the event itself. 
Add GHES support to detect CXL CPER records and call a registered callback with the event. A notifier chain was considered for the callback but the complexity did not justify the use case as only the CXL subsystem requires this event. Enforce that only one callback can be registered at any time. Cc: Ard Biesheuvel Cc: Rafael J. Wysocki Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-7-1bb8a4ca2c7a@intel.com [djbw: fixup checkpatch errors] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/cxl-event.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 4d6c05f535f80..95841750a383b 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -107,4 +107,54 @@ struct cxl_event_record_raw { union cxl_event event; } __packed; +enum cxl_event_type { + CXL_CPER_EVENT_GEN_MEDIA, + CXL_CPER_EVENT_DRAM, + CXL_CPER_EVENT_MEM_MODULE, +}; + +#define CPER_CXL_DEVICE_ID_VALID BIT(0) +#define CPER_CXL_DEVICE_SN_VALID BIT(1) +#define CPER_CXL_COMP_EVENT_LOG_VALID BIT(2) +struct cxl_cper_event_rec { + struct { + u32 length; + u64 validation_bits; + struct cper_cxl_event_devid { + u16 vendor_id; + u16 device_id; + u8 func_num; + u8 device_num; + u8 bus_num; + u16 segment_num; + u16 slot_num; /* bits 2:0 reserved */ + u8 reserved; + } __packed device_id; + struct cper_cxl_event_sn { + u32 lower_dw; + u32 upper_dw; + } __packed dev_serial_num; + } __packed hdr; + + union cxl_event event; +} __packed; + +typedef void (*cxl_cper_callback)(enum cxl_event_type type, + struct cxl_cper_event_rec *rec); + +#ifdef CONFIG_ACPI_APEI_GHES +int cxl_cper_register_callback(cxl_cper_callback callback); +int cxl_cper_unregister_callback(cxl_cper_callback callback); +#else +static inline int cxl_cper_register_callback(cxl_cper_callback callback) +{ + return 0; +} + +static 
inline int cxl_cper_unregister_callback(cxl_cper_callback callback) +{ + return 0; +} +#endif + #endif /* _LINUX_CXL_EVENT_H */ -- cgit v1.2.3 From ced085ef369af7a2b6da962ec2fbd01339f60693 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:35 -0800 Subject: PCI: Introduce cleanup helpers for device reference counts and locks The "goto error" pattern is notorious for introducing subtle resource leaks. Use the new cleanup.h helpers for PCI device reference counts and locks. Similar to the new put_device() and device_lock() cleanup helpers, __free(put_device) and guard(device), define the same for PCI devices, __free(pci_dev_put) and guard(pci_dev). These helpers eliminate the need for "goto free;" and "goto unlock;" patterns. For example, A 'struct pci_dev *' instance declared as: struct pci_dev *pdev __free(pci_dev_put) = NULL; ...will automatically call pci_dev_put() if @pdev is non-NULL when @pdev goes out of scope (automatic variable scope). If a function wants to invoke pci_dev_put() on error, but return @pdev on success, it can do: return no_free_ptr(pdev); ...or: return_ptr(pdev); For potential cleanup opportunity there are 587 open-coded calls to pci_dev_put() in the kernel with 65 instances within 10 lines of a goto statement with the CXL driver threatening to add another one. The guard() helper holds the associated lock for the remainder of the current scope in which it was invoked. So, for example: func(...) { if (...) { ... guard(pci_dev); /* pci_dev_lock() invoked here */ ... } /* <- implied pci_dev_unlock() triggered here */ } There are 15 invocations of pci_dev_unlock() in the kernel with 5 instances within 10 lines of a goto statement. Again, the CXL driver is threatening to add another. Introduce these helpers to preclude the addition of new more error prone goto put; / goto unlock; sequences. 
For now, these helpers are used in drivers/cxl/pci.c to allow ACPI error reports to be fed back into the CXL driver associated with the PCI device identified in the report. Cc: Bjorn Helgaas Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-8-1bb8a4ca2c7a@intel.com [djbw: rewrite changelog] Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index dea043bc1e383..0d23d2e0eb1a4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1170,6 +1170,7 @@ int pci_get_interrupt_pin(struct pci_dev *dev, struct pci_dev **bridge); u8 pci_common_swizzle(struct pci_dev *dev, u8 *pinp); struct pci_dev *pci_dev_get(struct pci_dev *dev); void pci_dev_put(struct pci_dev *dev); +DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T)) void pci_remove_bus(struct pci_bus *b); void pci_stop_and_remove_bus_device(struct pci_dev *dev); void pci_stop_and_remove_bus_device_locked(struct pci_dev *dev); @@ -1874,6 +1875,7 @@ void pci_cfg_access_unlock(struct pci_dev *dev); void pci_dev_lock(struct pci_dev *dev); int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); +DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) /* * PCI domain support. Sometimes called PCI segment (eg by ACPI), -- cgit v1.2.3 From dc97f6344f205b0dfa144e1b3e16d6dc05383d57 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 20 Dec 2023 16:17:36 -0800 Subject: cxl/pci: Register for and process CPER events If the firmware has configured CXL event support to be firmware first the OS can process those events through CPER records. The CXL layer has unique DPA to HPA knowledge and standard event trace parsing in place. 
CPER records contain Bus, Device, Function information which can be used to identify the PCI device which is sending the event. Change the PCI driver registration to include registration of a CXL CPER callback to process events through the trace subsystem. Use new scoped based management to simplify the handling of the PCI device object. Tested-by: Smita-Koralahalli Reviewed-by: Smita-Koralahalli Link: https://lore.kernel.org/r/20231220-cxl-cper-v5-9-1bb8a4ca2c7a@intel.com Signed-off-by: Ira Weiny [djbw: use new pci_dev guard, flip init order] Reviewed-by: Jonathan Cameron Acked-by: Ard Biesheuvel Signed-off-by: Dan Williams --- include/linux/cxl-event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h index 95841750a383b..91125eca4c8ab 100644 --- a/include/linux/cxl-event.h +++ b/include/linux/cxl-event.h @@ -108,6 +108,7 @@ struct cxl_event_record_raw { } __packed; enum cxl_event_type { + CXL_CPER_EVENT_GENERIC, CXL_CPER_EVENT_GEN_MEDIA, CXL_CPER_EVENT_DRAM, CXL_CPER_EVENT_MEM_MODULE, -- cgit v1.2.3 From ef067191f73cce3ee192e991ce486d95524655d5 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:11:56 +0200 Subject: vdpa/mlx5: Expose resumable vq capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Necessary for checking if resumable vqs are supported by the hardware. Actual support will be added in a downstream patch. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-2-dtatulea@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f386..9eaceaf6bcb06 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1236,7 +1236,8 @@ struct mlx5_ifc_virtio_emulation_cap_bits { u8 reserved_at_c0[0x13]; u8 desc_group_mkey_supported[0x1]; - u8 reserved_at_d4[0xc]; + u8 freeze_to_rdy_supported[0x1]; + u8 reserved_at_d5[0xb]; u8 reserved_at_e0[0x20]; -- cgit v1.2.3 From 9b23417825df470e4c9e98e7ed4b2c37465bfa1e Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:11:59 +0200 Subject: vdpa/mlx5: Mark vq addrs for modification in hw vq Addresses get set by .set_vq_address. hw vq addresses will be updated on next modify_virtqueue. Reviewed-by: Gal Pressman Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-5-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/mlx5/mlx5_ifc_vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index b86d51a855f67..9594ac4057406 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -145,6 +145,7 @@ enum { MLX5_VIRTQ_MODIFY_MASK_STATE = (u64)1 << 0, MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS = (u64)1 << 3, MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- cgit v1.2.3 From 60c43b3f6b4eb5a3d672952a0d65991f414ea258 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:12:00 +0200 Subject: vdpa/mlx5: Mark vq state for modification in hw vq .set_vq_state will set the indices and mark the fields to be modified in the hw vq. 
Advertise that the device supports changing the vq state when the device is in DRIVER_OK state and suspended. Reviewed-by: Gal Pressman Signed-off-by: Dragos Tatulea Acked-by: Jason Wang Message-Id: <20231225151203.152687-6-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin --- include/linux/mlx5/mlx5_ifc_vdpa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 9594ac4057406..32e712106e684 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -146,6 +146,8 @@ enum { MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_PARAMS = (u64)1 << 3, MLX5_VIRTQ_MODIFY_MASK_DIRTY_BITMAP_DUMP_ENABLE = (u64)1 << 4, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX = (u64)1 << 7, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX = (u64)1 << 8, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- cgit v1.2.3 From f756dd3e2a4c704c0ab5ecb143ab71f1249af497 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 25 Dec 2023 17:12:01 +0200 Subject: vdpa/mlx5: Use vq suspend/resume during .set_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of tearing down and setting up vq resources, use vq suspend/resume during .set_map to speed things up a bit. The vq mr is updated with the new mapping while the vqs are suspended. If the device doesn't support resumable vqs, do the old teardown and setup dance. Reviewed-by: Gal Pressman Acked-by: Eugenio Pérez Acked-by: Jason Wang Signed-off-by: Dragos Tatulea Message-Id: <20231225151203.152687-7-dtatulea@nvidia.com> Signed-off-by: Michael S. 
Tsirkin --- include/linux/mlx5/mlx5_ifc_vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 32e712106e684..40371c916cf94 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -148,6 +148,7 @@ enum { MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS = (u64)1 << 6, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX = (u64)1 << 7, MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX = (u64)1 << 8, + MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY = (u64)1 << 11, MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY = (u64)1 << 14, }; -- cgit v1.2.3 From 3e999770ac1c7c31a70685dd5b88e89473509e9c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 9 Jan 2024 17:59:22 +0100 Subject: PM: sleep: Restore asynchronous device resume optimization Before commit 7839d0078e0d ("PM: sleep: Fix possible deadlocks in core system-wide PM code"), the resume of devices that were allowed to resume asynchronously was scheduled before starting the resume of the other devices, so the former did not have to wait for the latter unless functional dependencies were present. Commit 7839d0078e0d removed that optimization in order to address a correctness issue, but it can be restored with the help of a new device power management flag, so do that now. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Stanislaw Gruszka --- include/linux/pm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 92a4f69de0e80..a2f3e53a8196d 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -681,6 +681,7 @@ struct dev_pm_info { bool wakeup_path:1; bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ + bool async_in_progress:1; /* Owned by the PM core */ unsigned int must_resume:1; /* Owned by the PM core */ unsigned int may_skip_resume:1; /* Set by subsystems */ #else -- cgit v1.2.3 From f35b88b66fbb5c90298ce3aa483b8a2cf1f39ad0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jan 2024 20:10:08 -0800 Subject: iommu: Add cache_invalidate_user op The updates of the PTEs in the nested page table will be propagated to the hardware caches. Add a new domain op cache_invalidate_user() for the userspace to flush the hardware caches for a nested domain through iommufd. No wrapper for it, as it's only supposed to be used by iommufd. Then, pass in invalidation requests in form of a user data array containing a number of invalidation data entries. 
Link: https://lore.kernel.org/r/20240111041015.47920-2-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6291aa7b079b0..93c0d12dd047c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -284,6 +284,23 @@ struct iommu_user_data { size_t len; }; +/** + * struct iommu_user_data_array - iommu driver specific user space data array + * @type: The data type of all the entries in the user buffer array + * @uptr: Pointer to the user buffer array + * @entry_len: The fixed-width length of an entry in the array, in bytes + * @entry_num: The number of total entries in the array + * + * The user buffer includes an array of requests with format defined in + * include/uapi/linux/iommufd.h + */ +struct iommu_user_data_array { + unsigned int type; + void __user *uptr; + size_t entry_len; + u32 entry_num; +}; + /** * __iommu_copy_struct_from_user - Copy iommu driver specific user space data * @dst_data: Pointer to an iommu driver specific user data that is defined in @@ -440,6 +457,13 @@ struct iommu_ops { * @iotlb_sync_map: Sync mappings created recently using @map to the hardware * @iotlb_sync: Flush all queued ranges from the hardware TLBs and empty flush * queue + * @cache_invalidate_user: Flush hardware cache for user space IO page table. + * The @domain must be IOMMU_DOMAIN_NESTED. The @array + * passes in the cache invalidation requests, in form + * of a driver data structure. The driver must update + * array->entry_num to report the number of handled + * invalidation requests. 
The driver data structure + * must be defined in include/uapi/linux/iommufd.h * @iova_to_phys: translate iova to physical address * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform @@ -465,6 +489,8 @@ struct iommu_domain_ops { size_t size); void (*iotlb_sync)(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather); + int (*cache_invalidate_user)(struct iommu_domain *domain, + struct iommu_user_data_array *array); phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); -- cgit v1.2.3 From 77785117f9c73fd71a440a5ac86dd80752967adc Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 20:10:10 -0800 Subject: iommu: Add iommu_copy_struct_from_user_array helper Wrap up the data pointer/num sanity and __iommu_copy_struct_from_user() call for iommu drivers to copy driver specific data at a specific location in the struct iommu_user_data_array. And expect it to be used in cache_invalidate_user ops for example. 
Link: https://lore.kernel.org/r/20240111041015.47920-4-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe --- include/linux/iommu.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 93c0d12dd047c..9dbadf74a3a1e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -341,6 +341,57 @@ static inline int __iommu_copy_struct_from_user( sizeof(*kdst), \ offsetofend(typeof(*kdst), min_last)) +/** + * __iommu_copy_struct_from_user_array - Copy iommu driver specific user space + * data from an iommu_user_data_array + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_array: Pointer to a struct iommu_user_data_array for a user space array + * @data_type: The data type of the @dst_data. Must match with @src_array.type + * @index: Index to the location in the array to copy user data from + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. 
+ * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user_array( + void *dst_data, const struct iommu_user_data_array *src_array, + unsigned int data_type, unsigned int index, size_t data_len, + size_t min_len) +{ + struct iommu_user_data src_data; + + if (WARN_ON(!src_array || index >= src_array->entry_num)) + return -EINVAL; + if (!src_array->entry_num) + return -EINVAL; + src_data.uptr = src_array->uptr + src_array->entry_len * index; + src_data.len = src_array->entry_len; + src_data.type = src_array->type; + + return __iommu_copy_struct_from_user(dst_data, &src_data, data_type, + data_len, min_len); +} + +/** + * iommu_copy_struct_from_user_array - Copy iommu driver specific user space + * data from an iommu_user_data_array + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_array: Pointer to a struct iommu_user_data_array for a user space + * array + * @data_type: The data type of the @kdst. Must match with @user_array->type + * @index: Index to the location in the array to copy user data from + * @min_last: The last member of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \ + min_last) \ + __iommu_copy_struct_from_user_array( \ + kdst, user_array, data_type, index, sizeof(*(kdst)), \ + offsetofend(typeof(*(kdst)), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability -- cgit v1.2.3 From dee56ccb468a832074397fdbf22bbd9bf6d710aa Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 12 Dec 2023 15:18:02 +0100 Subject: fbdev: amba-clcd: Delete the old CLCD driver We have managed to ascertain that all users of the old FBDEV code that are out of tree are now gone. 
The new DRM driver can be found in drivers/gpu/drm/pl111/. The remaining out of tree user was the ARM FVP emulation platform, running Android. Thanks to changes in Android versions 13 and 14, Android can now use the DRM driver when being emulated under FVP. Some final patches are being put in place to make it fully featured. This is essentially a revert of the partial revert in commit 112c35237c72 ("Partially revert "video: fbdev: amba-clcd: Retire elder CLCD driver"") Signed-off-by: Linus Walleij Signed-off-by: Helge Deller --- include/linux/amba/clcd-regs.h | 87 ------------- include/linux/amba/clcd.h | 290 ----------------------------------------- 2 files changed, 377 deletions(-) delete mode 100644 include/linux/amba/clcd-regs.h delete mode 100644 include/linux/amba/clcd.h (limited to 'include/linux') diff --git a/include/linux/amba/clcd-regs.h b/include/linux/amba/clcd-regs.h deleted file mode 100644 index 421b0fa90d6ac..0000000000000 --- a/include/linux/amba/clcd-regs.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * David A Rusling - * - * Copyright (C) 2001 ARM Limited - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. 
- */ - -#ifndef AMBA_CLCD_REGS_H -#define AMBA_CLCD_REGS_H - -/* - * CLCD Controller Internal Register addresses - */ -#define CLCD_TIM0 0x00000000 -#define CLCD_TIM1 0x00000004 -#define CLCD_TIM2 0x00000008 -#define CLCD_TIM3 0x0000000c -#define CLCD_UBAS 0x00000010 -#define CLCD_LBAS 0x00000014 - -#define CLCD_PL110_IENB 0x00000018 -#define CLCD_PL110_CNTL 0x0000001c -#define CLCD_PL110_STAT 0x00000020 -#define CLCD_PL110_INTR 0x00000024 -#define CLCD_PL110_UCUR 0x00000028 -#define CLCD_PL110_LCUR 0x0000002C - -#define CLCD_PL111_CNTL 0x00000018 -#define CLCD_PL111_IENB 0x0000001c -#define CLCD_PL111_RIS 0x00000020 -#define CLCD_PL111_MIS 0x00000024 -#define CLCD_PL111_ICR 0x00000028 -#define CLCD_PL111_UCUR 0x0000002c -#define CLCD_PL111_LCUR 0x00000030 - -#define CLCD_PALL 0x00000200 -#define CLCD_PALETTE 0x00000200 - -#define TIM2_PCD_LO_MASK GENMASK(4, 0) -#define TIM2_PCD_LO_BITS 5 -#define TIM2_CLKSEL (1 << 5) -#define TIM2_ACB_MASK GENMASK(10, 6) -#define TIM2_IVS (1 << 11) -#define TIM2_IHS (1 << 12) -#define TIM2_IPC (1 << 13) -#define TIM2_IOE (1 << 14) -#define TIM2_BCD (1 << 26) -#define TIM2_PCD_HI_MASK GENMASK(31, 27) -#define TIM2_PCD_HI_BITS 5 -#define TIM2_PCD_HI_SHIFT 27 - -#define CNTL_LCDEN (1 << 0) -#define CNTL_LCDBPP1 (0 << 1) -#define CNTL_LCDBPP2 (1 << 1) -#define CNTL_LCDBPP4 (2 << 1) -#define CNTL_LCDBPP8 (3 << 1) -#define CNTL_LCDBPP16 (4 << 1) -#define CNTL_LCDBPP16_565 (6 << 1) -#define CNTL_LCDBPP16_444 (7 << 1) -#define CNTL_LCDBPP24 (5 << 1) -#define CNTL_LCDBW (1 << 4) -#define CNTL_LCDTFT (1 << 5) -#define CNTL_LCDMONO8 (1 << 6) -#define CNTL_LCDDUAL (1 << 7) -#define CNTL_BGR (1 << 8) -#define CNTL_BEBO (1 << 9) -#define CNTL_BEPO (1 << 10) -#define CNTL_LCDPWR (1 << 11) -#define CNTL_LCDVCOMP(x) ((x) << 12) -#define CNTL_LDMAFIFOTIME (1 << 15) -#define CNTL_WATERMARK (1 << 16) - -/* ST Microelectronics variant bits */ -#define CNTL_ST_1XBPP_444 0x0 -#define CNTL_ST_1XBPP_5551 (1 << 17) -#define CNTL_ST_1XBPP_565 (1 << 18) 
-#define CNTL_ST_CDWID_12 0x0 -#define CNTL_ST_CDWID_16 (1 << 19) -#define CNTL_ST_CDWID_18 (1 << 20) -#define CNTL_ST_CDWID_24 ((1 << 19)|(1 << 20)) -#define CNTL_ST_CEAEN (1 << 21) -#define CNTL_ST_LCDBPP24_PACKED (6 << 1) - -#endif /* AMBA_CLCD_REGS_H */ diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h deleted file mode 100644 index b6e0cbeaf533e..0000000000000 --- a/include/linux/amba/clcd.h +++ /dev/null @@ -1,290 +0,0 @@ -/* - * linux/include/asm-arm/hardware/amba_clcd.h -- Integrator LCD panel. - * - * David A Rusling - * - * Copyright (C) 2001 ARM Limited - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of this archive - * for more details. - */ -#include <linux/fb.h> -#include <linux/amba/clcd-regs.h> - -enum { - /* individual formats */ - CLCD_CAP_RGB444 = (1 << 0), - CLCD_CAP_RGB5551 = (1 << 1), - CLCD_CAP_RGB565 = (1 << 2), - CLCD_CAP_RGB888 = (1 << 3), - CLCD_CAP_BGR444 = (1 << 4), - CLCD_CAP_BGR5551 = (1 << 5), - CLCD_CAP_BGR565 = (1 << 6), - CLCD_CAP_BGR888 = (1 << 7), - - /* connection layouts */ - CLCD_CAP_444 = CLCD_CAP_RGB444 | CLCD_CAP_BGR444, - CLCD_CAP_5551 = CLCD_CAP_RGB5551 | CLCD_CAP_BGR5551, - CLCD_CAP_565 = CLCD_CAP_RGB565 | CLCD_CAP_BGR565, - CLCD_CAP_888 = CLCD_CAP_RGB888 | CLCD_CAP_BGR888, - - /* red/blue ordering */ - CLCD_CAP_RGB = CLCD_CAP_RGB444 | CLCD_CAP_RGB5551 | - CLCD_CAP_RGB565 | CLCD_CAP_RGB888, - CLCD_CAP_BGR = CLCD_CAP_BGR444 | CLCD_CAP_BGR5551 | - CLCD_CAP_BGR565 | CLCD_CAP_BGR888, - - CLCD_CAP_ALL = CLCD_CAP_BGR | CLCD_CAP_RGB, -}; - -struct backlight_device; - -struct clcd_panel { - struct fb_videomode mode; - signed short width; /* width in mm */ - signed short height; /* height in mm */ - u32 tim2; - u32 tim3; - u32 cntl; - u32 caps; - unsigned int bpp:8, - fixedtimings:1, - grayscale:1; - unsigned int connector; - struct backlight_device *backlight; - /* - * If the B/R lines are switched between the CLCD - * and the panel we need to know this and
not try to - * compensate with the BGR bit in the control register. - */ - bool bgr_connection; -}; - -struct clcd_regs { - u32 tim0; - u32 tim1; - u32 tim2; - u32 tim3; - u32 cntl; - unsigned long pixclock; -}; - -struct clcd_fb; - -/* - * the board-type specific routines - */ -struct clcd_board { - const char *name; - - /* - * Optional. Hardware capability flags. - */ - u32 caps; - - /* - * Optional. Check whether the var structure is acceptable - * for this display. - */ - int (*check)(struct clcd_fb *fb, struct fb_var_screeninfo *var); - - /* - * Compulsory. Decode fb->fb.var into regs->*. In the case of - * fixed timing, set regs->* to the register values required. - */ - void (*decode)(struct clcd_fb *fb, struct clcd_regs *regs); - - /* - * Optional. Disable any extra display hardware. - */ - void (*disable)(struct clcd_fb *); - - /* - * Optional. Enable any extra display hardware. - */ - void (*enable)(struct clcd_fb *); - - /* - * Setup platform specific parts of CLCD driver - */ - int (*setup)(struct clcd_fb *); - - /* - * mmap the framebuffer memory - */ - int (*mmap)(struct clcd_fb *, struct vm_area_struct *); - - /* - * Remove platform specific parts of CLCD driver - */ - void (*remove)(struct clcd_fb *); -}; - -struct amba_device; -struct clk; - -/* this data structure describes each frame buffer device we find */ -struct clcd_fb { - struct fb_info fb; - struct amba_device *dev; - struct clk *clk; - struct clcd_panel *panel; - struct clcd_board *board; - void *board_data; - void __iomem *regs; - u16 off_ienb; - u16 off_cntl; - u32 clcd_cntl; - u32 cmap[16]; - bool clk_enabled; -}; - -static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs) -{ - struct fb_var_screeninfo *var = &fb->fb.var; - u32 val, cpl; - - /* - * Program the CLCD controller registers and start the CLCD - */ - val = ((var->xres / 16) - 1) << 2; - val |= (var->hsync_len - 1) << 8; - val |= (var->right_margin - 1) << 16; - val |= (var->left_margin - 1) << 24; - 
regs->tim0 = val; - - val = var->yres; - if (fb->panel->cntl & CNTL_LCDDUAL) - val /= 2; - val -= 1; - val |= (var->vsync_len - 1) << 10; - val |= var->lower_margin << 16; - val |= var->upper_margin << 24; - regs->tim1 = val; - - val = fb->panel->tim2; - val |= var->sync & FB_SYNC_HOR_HIGH_ACT ? 0 : TIM2_IHS; - val |= var->sync & FB_SYNC_VERT_HIGH_ACT ? 0 : TIM2_IVS; - - cpl = var->xres_virtual; - if (fb->panel->cntl & CNTL_LCDTFT) /* TFT */ - /* / 1 */; - else if (!var->grayscale) /* STN color */ - cpl = cpl * 8 / 3; - else if (fb->panel->cntl & CNTL_LCDMONO8) /* STN monochrome, 8bit */ - cpl /= 8; - else /* STN monochrome, 4bit */ - cpl /= 4; - - regs->tim2 = val | ((cpl - 1) << 16); - - regs->tim3 = fb->panel->tim3; - - val = fb->panel->cntl; - if (var->grayscale) - val |= CNTL_LCDBW; - - if (fb->panel->caps && fb->board->caps && var->bits_per_pixel >= 16) { - /* - * if board and panel supply capabilities, we can support - * changing BGR/RGB depending on supplied parameters. Here - * we switch to what the framebuffer is providing if need - * be, so if the framebuffer is BGR but the display connection - * is RGB (first case) we switch it around. Vice versa mutatis - * mutandis if the framebuffer is RGB but the display connection - * is BGR, we flip it around. - */ - if (var->red.offset == 0) - val &= ~CNTL_BGR; - else - val |= CNTL_BGR; - if (fb->panel->bgr_connection) - val ^= CNTL_BGR; - } - - switch (var->bits_per_pixel) { - case 1: - val |= CNTL_LCDBPP1; - break; - case 2: - val |= CNTL_LCDBPP2; - break; - case 4: - val |= CNTL_LCDBPP4; - break; - case 8: - val |= CNTL_LCDBPP8; - break; - case 16: - /* - * PL110 cannot choose between 5551 and 565 modes in its - * control register. It is possible to use 565 with - * custom external wiring. 
- */ - if (amba_part(fb->dev) == 0x110 || - var->green.length == 5) - val |= CNTL_LCDBPP16; - else if (var->green.length == 6) - val |= CNTL_LCDBPP16_565; - else - val |= CNTL_LCDBPP16_444; - break; - case 32: - val |= CNTL_LCDBPP24; - break; - } - - regs->cntl = val; - regs->pixclock = var->pixclock; -} - -static inline int clcdfb_check(struct clcd_fb *fb, struct fb_var_screeninfo *var) -{ - var->xres_virtual = var->xres = (var->xres + 15) & ~15; - var->yres_virtual = var->yres = (var->yres + 1) & ~1; - -#define CHECK(e,l,h) (var->e < l || var->e > h) - if (CHECK(right_margin, (5+1), 256) || /* back porch */ - CHECK(left_margin, (5+1), 256) || /* front porch */ - CHECK(hsync_len, (5+1), 256) || - var->xres > 4096 || - var->lower_margin > 255 || /* back porch */ - var->upper_margin > 255 || /* front porch */ - var->vsync_len > 32 || - var->yres > 1024) - return -EINVAL; -#undef CHECK - - /* single panel mode: PCD = max(PCD, 1) */ - /* dual panel mode: PCD = max(PCD, 5) */ - - /* - * You can't change the grayscale setting, and - * we can only do non-interlaced video. 
- */ - if (var->grayscale != fb->fb.var.grayscale || - (var->vmode & FB_VMODE_MASK) != FB_VMODE_NONINTERLACED) - return -EINVAL; - -#define CHECK(e) (var->e != fb->fb.var.e) - if (fb->panel->fixedtimings && - (CHECK(xres) || - CHECK(yres) || - CHECK(bits_per_pixel) || - CHECK(pixclock) || - CHECK(left_margin) || - CHECK(right_margin) || - CHECK(upper_margin) || - CHECK(lower_margin) || - CHECK(hsync_len) || - CHECK(vsync_len) || - CHECK(sync))) - return -EINVAL; -#undef CHECK - - var->nonstd = 0; - var->accel_flags = 0; - - return 0; -} -- cgit v1.2.3 From 7ea26f9460c6c76b1d6e36f39fce34b16cb88300 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 9 Jan 2024 20:22:45 +0200 Subject: fsnotify: compile out fsnotify permission hooks if !FANOTIFY_ACCESS_PERMISSIONS The dependency of FANOTIFY_ACCESS_PERMISSIONS on SECURITY made sure that the fsnotify permission hooks were never called when SECURITY was disabled. Moving the fsnotify permission hook out of the security hook broke that optimisation. 
Reported-and-tested-by: Jens Axboe Closes: https://lore.kernel.org/linux-fsdevel/53682ece-f0e7-48de-9a1c-879ee34b0449@kernel.dk/ Fixes: d9e5d31084b0 ("fsnotify: optionally pass access range in file permission hooks") Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240109182245.38884-1-amir73il@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fsnotify.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 11e6434b8e714..8300a52869887 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -100,6 +100,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask) return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH); } +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS /* * fsnotify_file_area_perm - permission hook before access to file range */ @@ -145,6 +146,24 @@ static inline int fsnotify_open_perm(struct file *file) return fsnotify_file(file, FS_OPEN_PERM); } +#else +static inline int fsnotify_file_area_perm(struct file *file, int perm_mask, + const loff_t *ppos, size_t count) +{ + return 0; +} + +static inline int fsnotify_file_perm(struct file *file, int perm_mask) +{ + return 0; +} + +static inline int fsnotify_open_perm(struct file *file) +{ + return 0; +} +#endif + /* * fsnotify_link_count - inode's link count changed */ -- cgit v1.2.3 From 755113d7678681a137c330f7997ceb680adb644e Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 9 Jan 2024 10:41:11 +0100 Subject: thermal/debugfs: Add thermal cooling device debugfs information The thermal framework does not have any debug information except a sysfs stat which is a bit controversial. This one allocates big chunks of memory for every cooling devices with a high number of states and could represent on some systems in production several megabytes of memory for just a portion of it. 
As the sysfs is limited to a page size, the output is not exploitable with large data arrays and gets truncated. The patch provides the same information as sysfs except the transitions are dynamically allocated, thus they won't show more events than the ones which actually occurred. There is no longer a size limitation and it opens the field for more debugging information, which is what debugfs, not sysfs, is designed for. The thermal debugfs directory structure tries to stay consistent with the sysfs one but in a very simplified way: thermal/ -- cooling_devices |-- 0 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 1 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 2 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table |-- 3 | |-- clear | |-- time_in_state_ms | |-- total_trans | `-- trans_table `-- 4 |-- clear |-- time_in_state_ms |-- total_trans `-- trans_table The content of the files in the cooling devices directory is the same as the sysfs one except for the trans_table which has the following format: Transition Hits 1->0 246 0->1 246 2->1 632 1->2 632 3->2 98 2->3 98 Signed-off-by: Daniel Lezcano [ rjw: White space fixups, rebase ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/thermal.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 9d0427da32af5..7defea8fa2230 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -32,6 +32,7 @@ struct thermal_zone_device; struct thermal_cooling_device; struct thermal_instance; +struct thermal_debugfs; struct thermal_attr; enum thermal_trend { @@ -113,6 +114,9 @@ struct thermal_cooling_device { struct mutex lock; /* protect thermal_instances list */ struct list_head thermal_instances; struct list_head node; +#ifdef CONFIG_THERMAL_DEBUGFS + struct thermal_debugfs *debugfs; +#endif }; /** @@ -189,6 +193,9 @@ struct thermal_zone_device { struct list_head node; struct delayed_work poll_queue; enum thermal_notify_event notify_event; +#ifdef CONFIG_THERMAL_DEBUGFS + struct thermal_debugfs *debugfs; +#endif bool suspended; }; -- cgit v1.2.3 From 57a427c81c322c5f0cdfe7c46cdee553d18b1ec6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 5 Jan 2024 14:45:11 +0100 Subject: thermal: core: Use kstrdup_const() during cooling device registration Some *thermal_cooling_device_register() calls pass a string literal as the 'type' parameter, so kstrdup_const() can be used instead of kstrdup() to avoid a memory allocation in such cases. Signed-off-by: Christophe JAILLET [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 7defea8fa2230..3227335fb4471 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -103,7 +103,7 @@ struct thermal_cooling_device_ops { struct thermal_cooling_device { int id; - char *type; + const char *type; unsigned long max_state; struct device device; struct device_node *np; -- cgit v1.2.3 From ba5afb9a84df2e6b26a1b6389b98849cd16ea757 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Jan 2024 09:09:14 +0100 Subject: fs: rework listmount() implementation Linus pointed out that there are error handling and naming issues in the listmount() implementation that we should rewrite: * Perform the access checks for the buffer before actually doing any work instead of doing it during the iteration. * Rename the arguments to listmount() and do_listmount() to clarify what the arguments are used for. * Get rid of the pointless ctr variable and overflow checking. * Get rid of the pointless speculation check. 
Link: https://lore.kernel.org/r/CAHk-=wjh6Cypo8WC-McXgSzCaou3UXccxB+7PVeSuGR8AjCphg@mail.gmail.com Suggested-by: Linus Torvalds Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 5c0dbef55792f..cdba4d0c6d4a8 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -414,7 +414,7 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req, struct statmount __user *buf, size_t bufsize, unsigned int flags); asmlinkage long sys_listmount(const struct mnt_id_req __user *req, - u64 __user *buf, size_t bufsize, + u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); -- cgit v1.2.3 From 9181d6f8a2bb32d158de66a84164fac05e3ddd18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 12:28:16 +0000 Subject: net: add more sanity check in virtio_net_hdr_to_skb() syzbot/KMSAN reports access to uninitialized data from gso_features_check() [1] The repro uses af_packet, injecting a gso packet and hdrlen == 0. We could fix the issue by making gso_features_check() more careful while dealing with NETIF_F_TSO_MANGLEID in fast path. Or we can make sure virtio_net_hdr_to_skb() pulls minimal network and transport headers as intended. Note that for GSO packets coming from untrusted sources, SKB_GSO_DODGY bit forces a proper header validation (and pull) before the packet can hit any device ndo_start_xmit(), thus we do not need a precise dissection at virtio_net_hdr_to_skb() stage. 
[1] BUG: KMSAN: uninit-value in skb_gso_segment include/net/gso.h:83 [inline] BUG: KMSAN: uninit-value in validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x10f2/0x1930 net/core/dev.c:3629 __dev_queue_xmit+0x1eac/0x5130 net/core/dev.c:4341 dev_queue_xmit include/linux/netdevice.h:3134 [inline] packet_xmit+0x9c/0x6b0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x8b1d/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 packet_alloc_skb net/packet/af_packet.c:2936 [inline] packet_snd net/packet/af_packet.c:3030 [inline] packet_sendmsg+0x70e8/0x9f30 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] 
do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5025 Comm: syz-executor279 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Reported-by: syzbot+7f4d0ea3df4d4fa9a65f@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/0000000000005abd7b060eb160cd@google.com/ Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 27cc1d4643219..4dfa9b69ca8d9 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_NET_H #include +#include +#include #include #include #include @@ -49,6 +51,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, const struct virtio_net_hdr *hdr, bool little_endian) { + unsigned int nh_min_len = sizeof(struct iphdr); unsigned int gso_type = 0; unsigned int thlen = 0; unsigned int p_off = 0; @@ -65,6 +68,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, gso_type = SKB_GSO_TCPV6; ip_proto = IPPROTO_TCP; thlen = sizeof(struct tcphdr); + nh_min_len = sizeof(struct ipv6hdr); break; case VIRTIO_NET_HDR_GSO_UDP: gso_type = SKB_GSO_UDP; @@ -100,7 +104,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; - p_off = skb_transport_offset(skb) + thlen; + nh_min_len = max_t(u32, nh_min_len, skb_transport_offset(skb)); + p_off = nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { @@ -140,7 +145,7 @@ retry: skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { - p_off = thlen; + p_off = 
nh_min_len + thlen; if (!pskb_may_pull(skb, p_off)) return -EINVAL; } -- cgit v1.2.3 From 34d946b723b53488ab39d8ac540ddf9db255317a Mon Sep 17 00:00:00 2001 From: Frank Li Date: Tue, 9 Jan 2024 00:25:48 -0500 Subject: i3c: master: fix kernel-doc check warning Fix warning found by 'scripts/kernel-doc -v -none include/linux/i3c/master.h' include/linux/i3c/master.h:457: warning: Function parameter or member 'enable_hotjoin' not described in 'i3c_master_controller_ops' include/linux/i3c/master.h:457: warning: Function parameter or member 'disable_hotjoin' not described in 'i3c_master_controller_ops' include/linux/i3c/master.h:499: warning: Function parameter or member 'hotjoin' not described in 'i3c_master_controller' Signed-off-by: Frank Li Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20240109052548.2128133-1-Frank.Li@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 1ecd73b17ff5d..0ca27dd869561 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -433,6 +433,8 @@ struct i3c_bus { * for a future IBI * This method is mandatory only if ->request_ibi is not * NULL. + * @enable_hotjoin: enable hot join event detect. + * @disable_hotjoin: disable hot join event detect. */ struct i3c_master_controller_ops { int (*bus_init)(struct i3c_master_controller *master); @@ -474,6 +476,7 @@ struct i3c_master_controller_ops { * @ops: master operations. 
See &struct i3c_master_controller_ops * @secondary: true if the master is a secondary master * @init_done: true when the bus initialization is done + * @hotjoin: true if the master support hotjoin * @boardinfo.i3c: list of I3C boardinfo objects * @boardinfo.i2c: list of I2C boardinfo objects * @boardinfo: board-level information attached to devices connected on the bus -- cgit v1.2.3 From aaefabc4a5f7ae48682c4d2d5d10faaf95c08eb9 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 7 Nov 2023 10:44:41 +0800 Subject: ceph: try to allocate a smaller extent map for sparse read In fscrypt case and for a smaller read length we can predict the max count of the extent map. And for small read length use cases this could save some memories. [ idryomov: squash into a single patch to avoid build break, drop redundant variable in ceph_alloc_sparse_ext_map() ] Signed-off-by: Xiubo Li Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index b8610e9d2471f..fa018d5864e74 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -572,9 +572,12 @@ int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt); */ #define CEPH_SPARSE_EXT_ARRAY_INITIAL 16 -static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op) +static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { - return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL); + if (!cnt) + cnt = CEPH_SPARSE_EXT_ARRAY_INITIAL; + + return __ceph_alloc_sparse_ext_map(op, cnt); } extern void ceph_osdc_get_request(struct ceph_osd_request *req); -- cgit v1.2.3 From 832b371097eb928d077c827b8f117bf5b99d35c0 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 15 Jan 2024 16:05:26 +0100 Subject: gpiolib: Fix scope-based gpio_device refcounting Commit 
9e4555d1e54a ("gpiolib: add support for scope-based management to gpio_device") sought to add scope-based gpio_device refcounting, but erroneously forgot a negation of IS_ERR_OR_NULL(). As a result, gpio_device_put() is not called if the gpio_device pointer is valid (meaning the ref is leaked), but only called if the pointer is NULL or an ERR_PTR(). While at it drop a superfluous trailing semicolon. Fixes: 9e4555d1e54a ("gpiolib: add support for scope-based management to gpio_device") Signed-off-by: Lukas Wunner Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e846bd4e7559b..9a5c6c76e6533 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -635,7 +635,7 @@ struct gpio_device *gpio_device_get(struct gpio_device *gdev); void gpio_device_put(struct gpio_device *gdev); DEFINE_FREE(gpio_device_put, struct gpio_device *, - if (IS_ERR_OR_NULL(_T)) gpio_device_put(_T)); + if (!IS_ERR_OR_NULL(_T)) gpio_device_put(_T)) struct device *gpio_device_to_device(struct gpio_device *gdev); -- cgit v1.2.3 From 2db6b72c989763e30fab83b186e9263fece26bc6 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 6 Dec 2023 12:17:51 -0600 Subject: PCI: Fix kernel-doc issues Fix kernel-doc issues reported by "find include -name \*pci\* | xargs scripts/kernel-doc -none": include/linux/pci.h:731: warning: Function parameter or member 'pdev' not described in 'pci_is_vga' include/linux/pci-epc.h:154: warning: Function parameter or member 'list_lock' not described in 'pci_epc' include/linux/pci-epf.h:83: warning: expecting prototype for struct pci_epf_event_ops. 
Prototype was for struct pci_epc_event_ops instead Link: https://lore.kernel.org/r/20240111162850.2177655-1-helgaas@kernel.org Tested-by: Randy Dunlap Signed-off-by: Bjorn Helgaas Reviewed-by: Manivannan Sadhasivam Acked-by: Randy Dunlap Acked-by: Sui Jingfeng --- include/linux/pci-epc.h | 2 +- include/linux/pci-epf.h | 2 +- include/linux/pci.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5cb6940310729..bfe41b03b70ca 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -122,7 +122,7 @@ struct pci_epc_mem { * struct pci_epc - represents the PCI EPC device * @dev: PCI EPC device * @pci_epf: list of endpoint functions present in this EPC device - * list_lock: Mutex for protecting pci_epf list + * @list_lock: Mutex for protecting pci_epf list * @ops: function pointers for performing endpoint operations * @windows: array of address space of the endpoint controller * @mem: first window of the endpoint controller, which corresponds to diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 3f44b6aec4770..92d0b71d33d7f 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -68,7 +68,7 @@ struct pci_epf_ops { }; /** - * struct pci_epf_event_ops - Callbacks for capturing the EPC events + * struct pci_epc_event_ops - Callbacks for capturing the EPC events * @core_init: Callback for the EPC initialization complete event * @link_up: Callback for the EPC link up event * @link_down: Callback for the EPC link down event diff --git a/include/linux/pci.h b/include/linux/pci.h index 1a89dc66f89ac..eb45087d7e001 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -715,6 +715,7 @@ static inline bool pci_is_bridge(struct pci_dev *dev) /** * pci_is_vga - check if the PCI device is a VGA device + * @pdev: PCI device * * The PCI Code and ID Assignment spec, r1.15, secs 1.4 and 1.1, define * VGA Base Class and Sub-Classes: -- cgit 
v1.2.3 From 5f4c01f1e3c7b0c8d1e5dd6f080531de7aa5e47b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Mon, 15 Jan 2024 17:19:34 -0300 Subject: spinlock: Fix failing build for PREEMPT_RT Since d1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to spinlock.h") build fails for PREEMPT_RT, since there is no definition available of either spin_needbreak() or rwlock_needbreak(). Since it was moved in the mentioned commit, it was placed inside a !PREEMPT_RT part of the code, making it out of reach for an RT kernel. Fix this by moving the code a few lines down so it can be reached by an RT build, where it can also make use of the *_is_contended() definition added by the spinlock_rt.h. Fixes: d1d71b30e1f85 ("sched.h: Move (spin|rwlock)_needbreak() to spinlock.h") Signed-off-by: Leonardo Bras Signed-off-by: Kent Overstreet Acked-by: Waiman Long --- include/linux/spinlock.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 0c71f06454d9e..b5c59fdad160f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -449,6 +449,12 @@ static __always_inline int spin_is_contended(spinlock_t *lock) return raw_spin_is_contended(&lock->rlock); } +#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) + +#else /* !CONFIG_PREEMPT_RT */ +# include +#endif /* CONFIG_PREEMPT_RT */ + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, @@ -480,12 +486,6 @@ static inline int rwlock_needbreak(rwlock_t *lock) #endif } -#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) - -#else /* !CONFIG_PREEMPT_RT */ -# include -#endif /* CONFIG_PREEMPT_RT */ - /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) -- cgit v1.2.3 From 7bed6f3d08b7af27b7015da8dc3acf2b9c1f21d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: 
Tue, 16 Jan 2024 21:29:59 +0000 Subject: block: Fix iterating over an empty bio with bio_for_each_folio_all If the bio contains no data, bio_first_folio() calls page_folio() on a NULL pointer and oopses. Move the test that we've reached the end of the bio from bio_next_folio() to bio_first_folio(). Reported-by: syzbot+8b23309d5788a79d3eea@syzkaller.appspotmail.com Reported-by: syzbot+004c1e0fced2b4bc3dcc@syzkaller.appspotmail.com Fixes: 640d1930bef4 ("block: Add bio_for_each_folio_all()") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240116212959.3413014-1-willy@infradead.org [axboe: add unlikely() to error case] Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ec4db73e5f4ec..875d792bffff8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -286,6 +286,11 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, { struct bio_vec *bvec = bio_first_bvec_all(bio) + i; + if (unlikely(i >= bio->bi_vcnt)) { + fi->folio = NULL; + return; + } + fi->folio = page_folio(bvec->bv_page); fi->offset = bvec->bv_offset + PAGE_SIZE * (bvec->bv_page - &fi->folio->page); @@ -303,10 +308,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); fi->_next = folio_next(fi->folio); - } else if (fi->_i + 1 < bio->bi_vcnt) { - bio_first_folio(fi, bio, fi->_i + 1); } else { - fi->folio = NULL; + bio_first_folio(fi, bio, fi->_i + 1); } } -- cgit v1.2.3 From a54e72197037d2c9bfcd70dddaac8c8ccb5b41ba Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:39 +0800 Subject: netfilter: propagate net to nf_bridge_get_physindev This is a preparation patch for replacing physindev with physinif on nf_bridge_info structure. 
We will use dev_get_by_index_rcu to resolve the device, when needed, and it requires net to be available. Signed-off-by: Pavel Tikhomirov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index f980edfdd2783..e927b9a15a556 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -56,7 +56,7 @@ static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) } static inline struct net_device * -nf_bridge_get_physindev(const struct sk_buff *skb) +nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); -- cgit v1.2.3 From 9874808878d9eed407e3977fd11fee49de1e1d86 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Thu, 11 Jan 2024 23:06:40 +0800 Subject: netfilter: bridge: replace physindev with physinif in nf_bridge_info An skb can be added to a neigh->arp_queue while waiting for an arp reply, where the original skb's skb->dev can be different from the neigh's neigh->dev. For instance in case of bridging dnated skb from one veth to another, the skb would be added to a neigh->arp_queue of the bridge. As skb->dev can be reset back to nf_bridge->physindev and used, and as there is no explicit mechanism that prevents this physindev from being freed under us (for instance neigh_flush_dev doesn't clean up skbs from different device's neigh queue) we can crash on e.g. this stack: arp_process neigh_update skb = __skb_dequeue(&neigh->arp_queue) neigh_resolve_output(..., skb) ... br_nf_dev_xmit br_nf_pre_routing_finish_bridge_slow skb->dev = nf_bridge->physindev br_handle_frame_finish Let's use plain ifindex instead of net_device link. To peek into the original net_device we will use dev_get_by_index_rcu(). 
Thus either we get device and are safe to use it or we don't get it and drop skb. Fixes: c4e70a87d975 ("netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c") Suggested-by: Florian Westphal Signed-off-by: Pavel Tikhomirov Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 4 ++-- include/linux/skbuff.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index e927b9a15a556..743475ca7e9d5 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -42,7 +42,7 @@ static inline int nf_bridge_get_physinif(const struct sk_buff *skb) if (!nf_bridge) return 0; - return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0; + return nf_bridge->physinif; } static inline int nf_bridge_get_physoutif(const struct sk_buff *skb) @@ -60,7 +60,7 @@ nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net) { const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); - return nf_bridge ? nf_bridge->physindev : NULL; + return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL; } static inline struct net_device * diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a5ae952454c89..2dde34c29203b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -295,7 +295,7 @@ struct nf_bridge_info { u8 bridged_dnat:1; u8 sabotage_in_done:1; __u16 frag_max_size; - struct net_device *physindev; + int physinif; /* always valid & non-NULL from FORWARD on, for physdev match */ struct net_device *physoutdev; -- cgit v1.2.3 From 49e60333d743ae32db3bdde2f93bc818482dd741 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2024 12:36:09 -0800 Subject: blk-mq: Remove the hctx 'run' debugfs attribute Nobody uses the debugfs hctx 'run' attribute. Hence remove this attribute and also the code that updates the corresponding member variable. 
Suggested-by: Jens Axboe Cc: Gabriel Ryan Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240117203609.4122520-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a676e116085f3..7a8150a5f0513 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -391,9 +391,6 @@ struct blk_mq_hw_ctx { */ struct blk_mq_tags *sched_tags; - /** @run: Number of dispatched requests. */ - unsigned long run; - /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ -- cgit v1.2.3 From 66967a32d3b16ed447e76fed4d946bab52e43d86 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 17 Jan 2024 19:31:40 -0800 Subject: bpf: extract bpf_ctx_convert_map logic and make it more reusable Refactor btf_get_prog_ctx_type() a bit to allow reuse of bpf_ctx_convert_map logic in more than one place. Simplify interface by returning btf_type instead of btf_member (field reference in BTF). To do the above we need to touch and start untangling btf_translate_to_vmlinux() implementation. We do the bare minimum to not regress anything for btf_translate_to_vmlinux(), but its implementation is very questionable for what it claims to be doing. Mapping kfunc argument types to corresponding kernel types conceptually is quite different from recognizing program context types. Fixing this is out of scope for this change though. 
Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240118033143.3384355-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 59d404e22814e..cf5c6ff489812 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -512,7 +512,7 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); -const struct btf_member * +const struct btf_type * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); -- cgit v1.2.3 From 7a8e9cdf9405819105ae7405cd91e482bf574b01 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 16 Jan 2024 08:09:25 -0600 Subject: seq_buf: Make DECLARE_SEQ_BUF() usable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the address operator on the array doesn't work: ./include/linux/seq_buf.h:27:27: error: initialization of ‘char *’ from incompatible pointer type ‘char (*)[128]’ [-Werror=incompatible-pointer-types] 27 | .buffer = &__ ## NAME ## _buffer, \ | ^ Apart from fixing that, we can improve DECLARE_SEQ_BUF() by using a compound literal to define the buffer array without attaching a name to it. This makes the macro a single statement, allowing constructs such as: static DECLARE_SEQ_BUF(my_seq_buf, MYSB_SIZE); to work as intended. 
Link: https://lkml.kernel.org/r/20240116-declare-seq-buf-fix-v1-1-915db4692f32@linux.ibm.com Cc: stable@vger.kernel.org Acked-by: Kees Cook Fixes: dcc4e5728eea ("seq_buf: Introduce DECLARE_SEQ_BUF and seq_buf_str()") Signed-off-by: Nathan Lynch Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 5fb1f12c33f90..c44f4b47b9453 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -22,9 +22,8 @@ struct seq_buf { }; #define DECLARE_SEQ_BUF(NAME, SIZE) \ - char __ ## NAME ## _buffer[SIZE] = ""; \ struct seq_buf NAME = { \ - .buffer = &__ ## NAME ## _buffer, \ + .buffer = (char[SIZE]) { 0 }, \ .size = SIZE, \ } -- cgit v1.2.3 From b60db383e2ba64a18e49b6bef3be1ab18aa159f1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 23 Nov 2023 10:40:40 +0100 Subject: include/linux/i2c.h: remove I2C_CLASS_DDC support After removal of the legacy EEPROM driver and I2C_CLASS_DDC support in olpc_dcon there's no i2c client driver left supporting I2C_CLASS_DDC. Class-based device auto-detection is a legacy mechanism and shouldn't be used in new code. So we can remove this class completely now. Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 0dae9db275380..d029aade338fd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -850,7 +850,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... 
*/ -#define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_SPD (1<<7) /* Memory modules */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) -- cgit v1.2.3 From 73febd775bdbdb98c81255ff85773ac410ded5c4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 12 Nov 2023 17:54:41 -0500 Subject: i2c: create debugfs entry per adapter Two drivers already implement custom debugfs handling for their i2c_adapter and more will come. So, let the core create a debugfs directory per adapter and pass that to drivers for their debugfs files. Signed-off-by: Wolfram Sang Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d029aade338fd..e01fb1097868c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -746,6 +746,8 @@ struct i2c_adapter { struct irq_domain *host_notify_domain; struct regulator *bus_regulator; + + struct dentry *debugfs; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -- cgit v1.2.3 From 94959c0e796e41128483588d133b9a7003b409f9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Dec 2023 16:22:43 +0100 Subject: i2c: make i2c_bus_type const Now that the driver core can properly handle constant struct bus_type, move the i2c_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Note, the sound/soc/rockchip/rk3399_gru_sound.c also needed tweaking as it decided to save off a pointer to a bus type for internal stuff, and it was using the i2c_bus_type as well. 
Signed-off-by: Greg Kroah-Hartman Acked-by: Mark Brown Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e01fb1097868c..652ecb7abedae 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -23,7 +23,7 @@ #include <linux/swab.h> /* for swab16 */ #include <uapi/linux/i2c.h> -extern struct bus_type i2c_bus_type; +extern const struct bus_type i2c_bus_type; extern struct device_type i2c_adapter_type; extern struct device_type i2c_client_type; -- cgit v1.2.3 From a8355235dbd571b32c750ee756dd6dac216d18f2 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 8 Nov 2023 07:38:07 +0100 Subject: i2c: mux: reg: Remove class-based device auto-detection support Legacy class-based device auto-detection shouldn't be used in new code. Therefore remove support in i2c-mux-reg as long as we don't have a user of this feature yet. Link: https://lore.kernel.org/linux-i2c/a22978a4-88e4-46f4-b71c-032b22321599@gmail.com/ Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- include/linux/platform_data/i2c-mux-reg.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/i2c-mux-reg.h b/include/linux/platform_data/i2c-mux-reg.h index 2543c2a1c9aef..e2e8957683116 100644 --- a/include/linux/platform_data/i2c-mux-reg.h +++ b/include/linux/platform_data/i2c-mux-reg.h @@ -17,7 +17,6 @@ * @n_values: Number of multiplexer channels * @little_endian: Indicating if the register is in little endian * @write_only: Reading the register is not allowed by hardware - * @classes: Optional I2C auto-detection classes * @idle: Value to write to mux when idle * @idle_in_use: indicate if idle value is in use * @reg: Virtual address of the register to switch channel @@ -30,7 +29,6 @@ struct i2c_mux_reg_platform_data { int n_values; bool little_endian; bool write_only; - const unsigned int *classes; u32 idle; bool idle_in_use; void 
__iomem *reg; -- cgit v1.2.3 From ef175b29a242fea98f467f008237484b03c94834 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 15 Jan 2021 15:24:59 -0600 Subject: of: Stop circularly including of_device.h and of_platform.h The DT of_device.h and of_platform.h headers date back to the separate of_platform_bus_type before it was merged into the regular platform bus. As part of that merge prepping Arm DT support 13 years ago, they "temporarily" include each other. The headers also include platform_device.h and of.h. The result was that lots of drivers relied on these implicit includes. Now the entire tree has been fixed over the last couple of cycles to explicitly include the necessary headers instead of relying on of_device.h and/or of_platform.h implicit includes, so the implicit and circular includes can finally be removed. Signed-off-by: Rob Herring --- include/linux/of_device.h | 5 +---- include/linux/of_platform.h | 4 ++-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index a72661e47faa5..9042bca5bb848 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -2,10 +2,7 @@ #ifndef _LINUX_OF_DEVICE_H #define _LINUX_OF_DEVICE_H -#include <linux/platform_device.h> -#include <linux/of_platform.h> /* temporary until merge */ - -#include <linux/of.h> +#include <linux/device/driver.h> struct device; struct of_device_id; diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index fadfea5754852..a2ff1ad48f7f0 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -7,11 +7,11 @@ */ #include <linux/mod_devicetable.h> -#include <linux/of_device.h> -#include <linux/platform_device.h> struct device; +struct device_node; struct of_device_id; +struct platform_device; /** * struct of_dev_auxdata - lookup table entry for device names & platform_data -- cgit v1.2.3 From d26270061ae66b915138af7cd73ca6f8b85e6b44 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Jan 2024 12:31:55 -0800 Subject: string: Remove strlcpy() With all the users of strlcpy() removed[1] from the kernel, 
remove the API, self-tests, and other references. Leave mentions in Documentation (about its deprecation), and in checkpatch.pl (to help migrate host-only tools/ usage). Long live strscpy(). Link: https://github.com/KSPP/linux/issues/89 [1] Cc: Azeem Shaikh Cc: Andrew Morton Cc: Andy Whitcroft Cc: Joe Perches Cc: Dwaipayan Ray Cc: Lukas Bulwahn Cc: linux-hardening@vger.kernel.org Reviewed-by: Andy Shevchenko Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 51 ------------------------------------------ include/linux/string.h | 3 --- 2 files changed, 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 79ef6ac4c0211..89a6888f2f9e5 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -214,51 +214,6 @@ __kernel_size_t __fortify_strlen(const char * const POS p) return ret; } -/* Defined after fortified strlen() to reuse it. */ -extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); -/** - * strlcpy - Copy a string into another string buffer - * - * @p: pointer to destination of copy - * @q: pointer to NUL-terminated source string to copy - * @size: maximum number of bytes to write at @p - * - * If strlen(@q) >= @size, the copy of @q will be truncated at - * @size - 1 bytes. @p will always be NUL-terminated. - * - * Do not use this function. While FORTIFY_SOURCE tries to avoid - * over-reads when calculating strlen(@q), it is still possible. - * Prefer strscpy(), though note its different return values for - * detecting truncation. - * - * Returns total number of bytes written to @p, including terminating NUL. - * - */ -__FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) -{ - const size_t p_size = __member_size(p); - const size_t q_size = __member_size(q); - size_t q_len; /* Full count of source string length. */ - size_t len; /* Count of characters going into destination. 
*/ - - if (p_size == SIZE_MAX && q_size == SIZE_MAX) - return __real_strlcpy(p, q, size); - q_len = strlen(q); - len = (q_len >= size) ? size - 1 : q_len; - if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) { - /* Write size is always larger than destination. */ - if (len >= p_size) - __write_overflow(); - } - if (size) { - if (len >= p_size) - fortify_panic(__func__); - __underlying_memcpy(p, q, len); - p[len] = '\0'; - } - return q_len; -} - /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); /** @@ -272,12 +227,6 @@ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); * @p buffer. The behavior is undefined if the string buffers overlap. The * destination @p buffer is always NUL terminated, unless it's zero-sized. * - * Preferred to strlcpy() since the API doesn't require reading memory - * from the source @q string beyond the specified @size bytes, and since - * the return value is easier to error-check than strlcpy()'s. - * In addition, the implementation is robust to the string changing out - * from underneath it, unlike the current strlcpy() implementation. - * * Preferred to strncpy() since it always returns a valid string, and * doesn't unnecessarily force the tail of the destination buffer to be * zero padded. If padding is desired please use strscpy_pad(). diff --git a/include/linux/string.h b/include/linux/string.h index ce137830a0b99..ab148d8dbfc14 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -66,9 +66,6 @@ extern char * strcpy(char *,const char *); #ifndef __HAVE_ARCH_STRNCPY extern char * strncpy(char *,const char *, __kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRLCPY -size_t strlcpy(char *, const char *, size_t); -#endif #ifndef __HAVE_ARCH_STRSCPY ssize_t strscpy(char *, const char *, size_t); #endif -- cgit v1.2.3