From bc0fcb9823cd0894934cf968b525c575833d7078 Mon Sep 17 00:00:00 2001 From: Yilin Zhu Date: Sun, 12 Apr 2026 13:07:54 +0800 Subject: ipv6: xfrm6: release dst on error in xfrm6_rcv_encap() xfrm6_rcv_encap() performs an IPv6 route lookup when the skb does not already have a dst attached. ip6_route_input_lookup() returns a referenced dst entry even when the lookup resolves to an error route. If dst->error is set, xfrm6_rcv_encap() drops the skb without attaching the dst to the skb and without releasing the reference returned by the lookup. Repeated packets hitting this path therefore leak dst entries. Release the dst before jumping to the drop path. Fixes: 0146dca70b87 ("xfrm: add support for UDPv6 encapsulation of ESP") Cc: stable@kernel.org Reported-by: Yifan Wu Reported-by: Juefei Pu Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Suggested-by: Xin Liu Tested-by: Ruide Cao Signed-off-by: Yilin Zhu Signed-off-by: Ren Wei Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_protocol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index ea2f805d3b01..9b586fcec485 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -88,8 +88,10 @@ int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, skb, flags); - if (dst->error) + if (dst->error) { + dst_release(dst); goto drop; + } skb_dst_set(skb, dst); } -- cgit v1.2.3 From ec54093e6a8f87e800bb6aa15eb7fc1e33faa524 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sun, 19 Apr 2026 18:35:42 -0400 Subject: xfrm: ah: account for ESN high bits in async callbacks AH allocates its temporary auth/ICV layout differently when ESN is enabled: the async ahash setup appends a 4-byte seqhi slot before the ICV or auth_data area, but the async completion callbacks still reconstruct the temporary layout as if seqhi were absent. 
With an async AH implementation selected, that makes AH copy or compare the wrong bytes on both the IPv4 and IPv6 paths. In UML repro on IPv4 AH with ESN and forced async hmac(sha1), ping fails with 100% packet loss, and the callback logs show the pre-fix drift: ah4 output_done: esn=1 err=0 icv_off=20 expected_off=24 ah4 input_done: esn=1 auth_off=20 expected_auth_off=24 icv_off=32 expected_icv_off=36 Reconstruct the callback-side layout the same way the setup path built it by skipping the ESN seqhi slot before locating the saved auth_data or ICV. Per RFC 4302, the ESN high-order 32 bits participate in the AH ICV computation, so the async callbacks must account for the seqhi slot. Post-fix, the same IPv4 AH+ESN+forced-async-hmac(sha1) UML repro shows the corrected offset (ah4 output_done: esn=1 err=0 icv_off=24 expected_off=24) and ping succeeds; net/ipv4/ah4.o and net/ipv6/ah6.o build clean at W=1. IPv6 AH+ESN was not exercised at runtime, and the change has not been tested against a real async hardware AH engine. 
Fixes: d4d573d0334d ("{IPv4,xfrm} Add ESN support for AH egress part") Fixes: d8b2a8600b0e ("{IPv4,xfrm} Add ESN support for AH ingress part") Fixes: 26dd70c3fad3 ("{IPv6,xfrm} Add ESN support for AH egress part") Fixes: 8d6da6f32557 ("{IPv6,xfrm} Add ESN support for AH ingress part") Cc: stable@vger.kernel.org Assisted-by: Codex:gpt-5-4 Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Signed-off-by: Steffen Klassert --- net/ipv4/ah4.c | 14 ++++++++++++-- net/ipv6/ah6.c | 14 ++++++++++++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 5fb812443a08..4366cbac3f06 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -124,9 +124,14 @@ static void ah_output_done(void *data, int err) struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); + int seqhi_len = 0; + __be32 *seqhi; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); iph = AH_SKB_CB(skb)->tmp; - icv = ah_tmp_icv(iph, ihl); + seqhi = (__be32 *)((char *)iph + ihl); + icv = ah_tmp_icv(seqhi, seqhi_len); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; @@ -270,12 +275,17 @@ static void ah_input_done(void *data, int err) struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; + int seqhi_len = 0; + __be32 *seqhi; if (err) goto out; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); work_iph = AH_SKB_CB(skb)->tmp; - auth_data = ah_tmp_auth(work_iph, ihl); + seqhi = (__be32 *)((char *)work_iph + ihl); + auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? 
-EBADMSG : 0; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index cb26beea4398..de1e68199a01 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -317,14 +317,19 @@ static void ah6_output_done(void *data, int err) struct ipv6hdr *top_iph = ipv6_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); struct tmp_ext *iph_ext; + int seqhi_len = 0; + __be32 *seqhi; extlen = skb_network_header_len(skb) - sizeof(struct ipv6hdr); if (extlen) extlen += sizeof(*iph_ext); + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); iph_base = AH_SKB_CB(skb)->tmp; iph_ext = ah_tmp_ext(iph_base); - icv = ah_tmp_icv(iph_ext, extlen); + seqhi = (__be32 *)((char *)iph_ext + extlen); + icv = ah_tmp_icv(seqhi, seqhi_len); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); @@ -471,13 +476,18 @@ static void ah6_input_done(void *data, int err) struct ip_auth_hdr *ah = ip_auth_hdr(skb); int hdr_len = skb_network_header_len(skb); int ah_hlen = ipv6_authlen(ah); + int seqhi_len = 0; + __be32 *seqhi; if (err) goto out; + if (x->props.flags & XFRM_STATE_ESN) + seqhi_len = sizeof(*seqhi); work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, hdr_len); - icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); + seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); + icv = ah_tmp_icv(seqhi, seqhi_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) -- cgit v1.2.3 From 4a1b534177395627579c1fb9e7f9100ee88955dd Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Tue, 10 Feb 2026 11:07:31 +0800 Subject: wifi: ath12k: prepare REO update element only for primary link Commit [1] introduces dp->reo_cmd_update_rx_queue_list for the purpose of tracking all pending REO queue flush commands. The helper ath12k_dp_prepare_reo_update_elem() allocates an element and populates it with REO queue information, then add it to the list. 
The element would be helpful during the cleanup stage to finally unmap/free the corresponding REO queue buffer. In MLO scenarios with more than one link, for non-dp_primary_link_only chips like WCN7850, that helper is called for each link peer. This results in multiple elements added to the list but all of them pointing to the same REO queue buffer. Consequently the same buffer gets unmap/freed multiple times: BUG kmalloc-2k (Tainted: G B W O ): Object already free ----------------------------------------------------------------------------- Allocated in ath12k_wifi7_dp_rx_assign_reoq+0xce/0x280 [ath12k_wifi7] age=7436 cpu=10 pid=16130 __kmalloc_noprof ath12k_wifi7_dp_rx_assign_reoq ath12k_dp_rx_peer_tid_setup ath12k_dp_peer_setup ath12k_mac_station_add ath12k_mac_op_sta_state [...] Freed in ath12k_dp_rx_tid_cleanup.part.0+0x25/0x40 [ath12k] age=1 cpu=27 pid=16137 kfree ath12k_dp_rx_tid_cleanup.part.0 ath12k_dp_rx_reo_cmd_list_cleanup ath12k_dp_cmn_device_deinit ath12k_core_stop ath12k_core_hw_group_cleanup ath12k_pci_remove Fix this by allowing list addition for primary link only. Note dp_primary_link_only chips like QCN9274 are not affected by this change, because that's what they were doing in the first place. 
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 3bf2e57e7d6c ("wifi: ath12k: Add Retry Mechanism for REO RX Queue Update Failures") # [1] Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221011 Signed-off-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20260210-ath12k-rxtid-double-free-v1-1-8b523fb2886d@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/dp_rx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 250459facff3..25557dea5826 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -565,6 +565,9 @@ static int ath12k_dp_prepare_reo_update_elem(struct ath12k_dp *dp, lockdep_assert_held(&dp->dp_lock); + if (!peer->primary_link) + return 0; + elem = kzalloc_obj(*elem, GFP_ATOMIC); if (!elem) return -ENOMEM; -- cgit v1.2.3 From f3ba9e05cc7b65f41f58bb4808f6c3a8f7894bb1 Mon Sep 17 00:00:00 2001 From: Aaradhana Sahu Date: Fri, 10 Apr 2026 12:43:00 +0530 Subject: wifi: ath12k: fix OF node refcount imbalance in WSI graph traversal ath12k_core_get_wsi_info() traverses the WSI (Wired Serial Interface) device graph starting from dev->of_node. The current code uses dev->of_node directly as the local traversal pointer and calls of_node_put() on error. Since the driver does not own a reference to dev->of_node, dropping it during traversal results in the following OF refcount underflow: OF: ERROR: of_node_release() detected bad of_node_put() on /soc@0/wifi@c000000 CPU: 1 UID: 0 PID: 210 Comm: insmod Not tainted 6.19.0-rc4-next-20260109-00023-g797dd36dc178 #26 PREEMPT Hardware name: Qualcomm Technologies, Inc. 
IPQ5332 MI01.2 (DT) Call trace: show_stack+0x18/0x24 (C) dump_stack_lvl+0x60/0x80 dump_stack+0x18/0x24 of_node_release+0x164/0x1a0 kobject_put+0xb4/0x278 of_node_put+0x18/0x28 ath12k_core_init+0x29c/0x5d4 [ath12k] ath12k_ahb_probe+0x950/0xc14 [ath12k] platform_probe+0x5c/0xa4 really_probe+0xc0/0x3ec __driver_probe_device+0x80/0x170 driver_probe_device+0x3c/0x120 __driver_attach+0xc4/0x218 OF: ERROR: next of_node_put() on this node will result in a kobject warning 'refcount_t: underflow; use-after-free.' Fix this by explicitly acquiring a reference to the starting node using of_node_get() and attaching automatic cleanup via __free(device_node). Each discovered WSI node is stored in ag->wsi_node[] with its own of_node_get() reference. These references are later released in ath12k_core_free_wsi_info() during driver teardown. Also remove unnecessary memset() of wsi_node array since cleanup now explicitly sets pointers to NULL. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.6-01243-QCAHKSWPL_SILICONZ-1 Tested-on: IPQ5332 hw1.0 AHB WLAN.WBE.1.6-01275-QCAHKSWPL_SILICONZ-1 Fixes: 908c10c860e0 ("wifi: ath12k: parse multiple device information from Device Tree") Signed-off-by: Aaradhana Sahu Reviewed-by: Rameshkumar Sundaram Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20260410071300.2323603-1-aaradhana.sahu@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/core.c | 77 +++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/core.c b/drivers/net/wireless/ath/ath12k/core.c index 2519e2400d58..980a12fb2c6e 100644 --- a/drivers/net/wireless/ath/ath12k/core.c +++ b/drivers/net/wireless/ath/ath12k/core.c @@ -1838,10 +1838,22 @@ static struct ath12k_hw_group *ath12k_core_hw_group_alloc(struct ath12k_base *ab return ag; } +static void ath12k_core_free_wsi_info(struct ath12k_hw_group *ag) +{ + int i; + + for (i = 0; i < ag->num_devices; i++) { + of_node_put(ag->wsi_node[i]); 
+ ag->wsi_node[i] = NULL; + } + ag->num_devices = 0; +} + static void ath12k_core_hw_group_free(struct ath12k_hw_group *ag) { mutex_lock(&ath12k_hw_group_mutex); + ath12k_core_free_wsi_info(ag); list_del(&ag->list); kfree(ag); @@ -1867,52 +1879,59 @@ static struct ath12k_hw_group *ath12k_core_hw_group_find_by_dt(struct ath12k_bas static int ath12k_core_get_wsi_info(struct ath12k_hw_group *ag, struct ath12k_base *ab) { - struct device_node *wsi_dev = ab->dev->of_node, *next_wsi_dev; - struct device_node *tx_endpoint, *next_rx_endpoint; - int device_count = 0; - - next_wsi_dev = wsi_dev; + struct device_node *next_wsi_dev; + int device_count = 0, ret = 0; + struct device_node *wsi_dev; - if (!next_wsi_dev) + wsi_dev = of_node_get(ab->dev->of_node); + if (!wsi_dev) return -ENODEV; do { - ag->wsi_node[device_count] = next_wsi_dev; + if (device_count >= ATH12K_MAX_DEVICES) { + ath12k_warn(ab, "device count in DT %d is more than limit %d\n", + device_count, ATH12K_MAX_DEVICES); + ret = -EINVAL; + break; + } + + ag->wsi_node[device_count++] = of_node_get(wsi_dev); - tx_endpoint = of_graph_get_endpoint_by_regs(next_wsi_dev, 0, -1); + struct device_node *tx_endpoint __free(device_node) = + of_graph_get_endpoint_by_regs(wsi_dev, 0, -1); if (!tx_endpoint) { - of_node_put(next_wsi_dev); - return -ENODEV; + ret = -ENODEV; + break; } - next_rx_endpoint = of_graph_get_remote_endpoint(tx_endpoint); + struct device_node *next_rx_endpoint __free(device_node) = + of_graph_get_remote_endpoint(tx_endpoint); if (!next_rx_endpoint) { - of_node_put(next_wsi_dev); - of_node_put(tx_endpoint); - return -ENODEV; + ret = -ENODEV; + break; } - of_node_put(tx_endpoint); - of_node_put(next_wsi_dev); - next_wsi_dev = of_graph_get_port_parent(next_rx_endpoint); if (!next_wsi_dev) { - of_node_put(next_rx_endpoint); - return -ENODEV; + ret = -ENODEV; + break; } - of_node_put(next_rx_endpoint); + of_node_put(wsi_dev); + wsi_dev = next_wsi_dev; + } while (ab->dev->of_node != wsi_dev); - device_count++; 
- if (device_count > ATH12K_MAX_DEVICES) { - ath12k_warn(ab, "device count in DT %d is more than limit %d\n", - device_count, ATH12K_MAX_DEVICES); - of_node_put(next_wsi_dev); - return -EINVAL; + if (ret) { + while (--device_count >= 0) { + of_node_put(ag->wsi_node[device_count]); + ag->wsi_node[device_count] = NULL; } - } while (wsi_dev != next_wsi_dev); - of_node_put(next_wsi_dev); + of_node_put(wsi_dev); + return ret; + } + + of_node_put(wsi_dev); ag->num_devices = device_count; return 0; @@ -1983,9 +2002,9 @@ static struct ath12k_hw_group *ath12k_core_hw_group_assign(struct ath12k_base *a ath12k_core_get_wsi_index(ag, ab)) { ath12k_dbg(ab, ATH12K_DBG_BOOT, "unable to get wsi info from dt, grouping single device"); + ath12k_core_free_wsi_info(ag); ag->id = ATH12K_INVALID_GROUP_ID; ag->num_devices = 1; - memset(ag->wsi_node, 0, sizeof(ag->wsi_node)); wsi->index = 0; } -- cgit v1.2.3 From c4b6ad0e14f5df942eed5ebadaff84b468bd2496 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sat, 18 Apr 2026 22:37:00 +0300 Subject: wifi: ath10k: snoc: select POWER_SEQUENCING The commit afcf3ec615c9 ("wifi: ath10k: snoc: support powering on the device via pwrseq") made ath10k SNOC driver use devm_pwrseq_get(). Select the corresponding Kconfig symbol to make sure that API call is always available and doesn't return an error per se. 
Fixes: afcf3ec615c9 ("wifi: ath10k: snoc: support powering on the device via pwrseq") Reported-by: Luca Weiss Closes: https://lore.kernel.org/r/DHUHU7UIT487.139L3KIVRVREU@fairphone.com Signed-off-by: Dmitry Baryshkov Reviewed-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20260418-ath10k-snoc-pwrseq-v1-1-832594ba3294@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath10k/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ath/ath10k/Kconfig b/drivers/net/wireless/ath/ath10k/Kconfig index 876aed765833..efb9f022d8c6 100644 --- a/drivers/net/wireless/ath/ath10k/Kconfig +++ b/drivers/net/wireless/ath/ath10k/Kconfig @@ -46,6 +46,7 @@ config ATH10K_SNOC depends on ARCH_QCOM || COMPILE_TEST depends on QCOM_SMEM depends on QCOM_RPROC_COMMON || QCOM_RPROC_COMMON=n + select POWER_SEQUENCING select QCOM_SCM select QCOM_QMI_HELPERS help -- cgit v1.2.3 From 4498664e2d5888efabb96428196a926acdaa25ed Mon Sep 17 00:00:00 2001 From: Yu-Hsiang Tseng Date: Thu, 23 Apr 2026 02:08:14 +0800 Subject: wifi: ath12k: use lockdep_assert_in_rcu_read_lock() for RCU assertions Two functions in ath12k assert that the caller holds an RCU read lock: ath12k_mac_get_arvif() and ath12k_p2p_noa_update_vdev_iter(). Both use: WARN_ON(!rcu_read_lock_any_held()); On kernels using preemptible RCU (CONFIG_PREEMPT=y or CONFIG_PREEMPT_RT=y) without CONFIG_DEBUG_LOCK_ALLOC, this produces a false positive splat whenever these functions are invoked from paths that do hold the RCU read lock (e.g. firmware stats processing or mac80211 interface iteration). Root cause: - Without CONFIG_DEBUG_LOCK_ALLOC, rcu_read_lock_any_held() is a static inline that returns !preemptible() as a proxy for "in an RCU read section". - With preemptible RCU, rcu_read_lock() does not disable preemption. A task can therefore be preemptible while legitimately holding an RCU read lock, making the proxy unreliable. 
- Callers such as ath12k_wmi_tlv_rssi_chain_parse() (via guard(rcu)()) and ieee80211_iterate_active_interfaces_atomic() do hold the RCU read lock, so these warnings are incorrect. Typical splat seen on a WCN7850 station with periodic fw stats processing: WARNING: drivers/net/wireless/ath/ath12k/mac.c:791 at ath12k_mac_get_arvif+0x9e/0xd0 [ath12k] Tainted: G W O 6.19.13-rt #1 PREEMPT_RT Call Trace: ath12k_wmi_tlv_rssi_chain_parse+0x69/0x170 [ath12k] ath12k_wmi_tlv_iter+0x7f/0x120 [ath12k] ath12k_wmi_tlv_fw_stats_parse+0x342/0x6b0 [ath12k] ath12k_wmi_op_rx+0xe9e/0x3150 [ath12k] ath12k_htc_rx_completion_handler+0x3df/0x5b0 [ath12k] ath12k_ce_per_engine_service+0x325/0x3e0 [ath12k] ath12k_pci_ce_workqueue+0x20/0x40 [ath12k] Replace WARN_ON(!rcu_read_lock_any_held()) with lockdep_assert_in_rcu_read_lock(), which is gated on CONFIG_PROVE_RCU and therefore compiles out entirely when PROVE_RCU is disabled. PROVE_RCU kernels continue to get the full lockdep-based check, and the new helper precisely checks for rcu_read_lock() rather than any RCU variant, which better matches the callers' expectations. 
Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 3dd2c68f206e ("wifi: ath12k: prepare vif data structure for MLO handling") Suggested-by: Baochen Qiang Suggested-by: Sebastian Andrzej Siewior Reviewed-by: Baochen Qiang Reviewed-by: Rameshkumar Sundaram Signed-off-by: Yu-Hsiang Tseng Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20260422180814.1938317-1-asas1asas200@gmail.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/mac.c | 2 +- drivers/net/wireless/ath/ath12k/p2p.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index fbdfe6424fd7..df2334f3bad6 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -788,7 +788,7 @@ struct ath12k_link_vif *ath12k_mac_get_arvif(struct ath12k *ar, u32 vdev_id) /* To use the arvif returned, caller must have held rcu read lock. 
*/ - WARN_ON(!rcu_read_lock_any_held()); + lockdep_assert_in_rcu_read_lock(); arvif_iter.vdev_id = vdev_id; arvif_iter.ar = ar; diff --git a/drivers/net/wireless/ath/ath12k/p2p.c b/drivers/net/wireless/ath/ath12k/p2p.c index 59589748f1a8..19ebcd1d8eb2 100644 --- a/drivers/net/wireless/ath/ath12k/p2p.c +++ b/drivers/net/wireless/ath/ath12k/p2p.c @@ -123,7 +123,7 @@ static void ath12k_p2p_noa_update_vdev_iter(void *data, u8 *mac, struct ath12k_p2p_noa_arg *arg = data; struct ath12k_link_vif *arvif; - WARN_ON(!rcu_read_lock_any_held()); + lockdep_assert_in_rcu_read_lock(); arvif = &ahvif->deflink; if (!arvif->is_created || arvif->ar != arg->ar || arvif->vdev_id != arg->vdev_id) return; -- cgit v1.2.3 From 711a9c018ad252b2807f85d44e1267b595644f9b Mon Sep 17 00:00:00 2001 From: Rio Liu Date: Wed, 15 Apr 2026 16:57:13 +0000 Subject: wifi: mac80211: skip ieee80211_verify_sta_ht_mcs_support check in non-strict mode Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in their basic HT-MCS set. On cards with lower spatial streams, the check would fail, and we'd be stuck with no HT when it in fact works fine with its own supported rate. This change makes it so the check is only performed in strict mode. 
Fixes: 574faa0e936d ("wifi: mac80211: add HT and VHT basic set verification") Signed-off-by: Rio Liu Link: https://patch.msgid.link/99Mv9QEceyPrQhSP52MtAVmz0_kWJmzqotJjD9YW6LGLqk-AZloAueUyHCURilFkuqOh6Ecv8i2KKdSE1ujP3AnbU5QEouVisT1w_V3xdfc=@r26.me Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 160ae65a5c64..298ebff6bbf8 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -437,6 +437,15 @@ ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + /* + * Some Xfinity XB8 firmware advertises >1 spatial stream MCS indexes in + * their basic HT-MCS set. On cards with lower spatial streams, the check + * would fail, and we'd be stuck with no HT when it in fact work fine with + * its own supported rate. So check it only in strict mode. + */ + if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) + return true; + /* * P802.11REVme/D7.0 - 6.5.4.2.4 * ... -- cgit v1.2.3 From c623b63580880cc742255eaed3d79804c1b91143 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Thu, 16 Apr 2026 11:33:39 +0200 Subject: wifi: brcmfmac: Fix potential use-after-free issue when stopping watchdog task Watchdog task might end between send_sig() and kthread_stop() calls, what results in the use-after-free issue. Fix this by increasing watchdog task reference count before calling send_sig() and dropping it by switching to kthread_stop_put(). 
Cc: stable@vger.kernel.org Fixes: 373c83a801f1 ("brcmfmac: stop watchdog before detach and free everything") Fixes: a9ffda88be74 ("brcm80211: fmac: abstract bus_stop interface function pointer") Signed-off-by: Marek Szyprowski Acked-by: Arend van Spriel Link: https://patch.msgid.link/20260416093339.2066829-1-m.szyprowski@samsung.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 30f6fcb68632..8fb595733b9c 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -2476,8 +2476,9 @@ static void brcmf_sdio_bus_stop(struct device *dev) brcmf_dbg(TRACE, "Enter\n"); if (bus->watchdog_tsk) { + get_task_struct(bus->watchdog_tsk); send_sig(SIGTERM, bus->watchdog_tsk, 1); - kthread_stop(bus->watchdog_tsk); + kthread_stop_put(bus->watchdog_tsk); bus->watchdog_tsk = NULL; } @@ -4567,8 +4568,9 @@ void brcmf_sdio_remove(struct brcmf_sdio *bus) if (bus) { /* Stop watchdog task */ if (bus->watchdog_tsk) { + get_task_struct(bus->watchdog_tsk); send_sig(SIGTERM, bus->watchdog_tsk, 1); - kthread_stop(bus->watchdog_tsk); + kthread_stop_put(bus->watchdog_tsk); bus->watchdog_tsk = NULL; } -- cgit v1.2.3 From 1f4f78bf8549e6ac4f04fba4176854f3a6e0c332 Mon Sep 17 00:00:00 2001 From: Tristan Madani Date: Fri, 17 Apr 2026 11:11:44 +0000 Subject: wifi: b43: enforce bounds check on firmware key index in b43_rx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The firmware-controlled key index in b43_rx() can exceed the dev->key[] array size (58 entries). The existing B43_WARN_ON is non-enforcing in production builds, allowing an out-of-bounds read. 
Make the B43_WARN_ON check enforcing by dropping the frame when the firmware returns an invalid key index. Suggested-by: Jonas Gorski Acked-by: Michael Büsch Fixes: e4d6b7951812 ("[B43]: add mac80211-based driver for modern BCM43xx devices") Cc: stable@vger.kernel.org Signed-off-by: Tristan Madani Link: https://patch.msgid.link/20260417111145.2694196-1-tristmd@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/b43/xmit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43/xmit.c b/drivers/net/wireless/broadcom/b43/xmit.c index 7651b1bdb592..f0b082596637 100644 --- a/drivers/net/wireless/broadcom/b43/xmit.c +++ b/drivers/net/wireless/broadcom/b43/xmit.c @@ -702,7 +702,8 @@ void b43_rx(struct b43_wldev *dev, struct sk_buff *skb, const void *_rxhdr) * key index, but the ucode passed it slightly different. */ keyidx = b43_kidx_to_raw(dev, keyidx); - B43_WARN_ON(keyidx >= ARRAY_SIZE(dev->key)); + if (B43_WARN_ON(keyidx >= ARRAY_SIZE(dev->key))) + goto drop; if (dev->key[keyidx].algorithm != B43_SEC_ALGO_NONE) { wlhdr_len = ieee80211_hdrlen(fctl); -- cgit v1.2.3 From a035766f970bde2d4298346a31a80685be5c0205 Mon Sep 17 00:00:00 2001 From: Tristan Madani Date: Fri, 17 Apr 2026 11:11:45 +0000 Subject: wifi: b43legacy: enforce bounds check on firmware key index in RX path Same fix as b43: the firmware-controlled key index in b43legacy_rx() can exceed dev->max_nr_keys. The existing B43legacy_WARN_ON is non-enforcing in production builds, allowing an out-of-bounds read of dev->key[]. Make the check enforcing by dropping the frame for invalid indices. 
Fixes: 75388acd0cd8 ("[B43LEGACY]: add mac80211-based driver for legacy BCM43xx devices") Cc: stable@vger.kernel.org Signed-off-by: Tristan Madani Link: https://patch.msgid.link/20260417111145.2694196-2-tristmd@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/b43legacy/xmit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/b43legacy/xmit.c b/drivers/net/wireless/broadcom/b43legacy/xmit.c index efd63f4ce74f..ee199d4eaf03 100644 --- a/drivers/net/wireless/broadcom/b43legacy/xmit.c +++ b/drivers/net/wireless/broadcom/b43legacy/xmit.c @@ -476,7 +476,8 @@ void b43legacy_rx(struct b43legacy_wldev *dev, * key index, but the ucode passed it slightly different. */ keyidx = b43legacy_kidx_to_raw(dev, keyidx); - B43legacy_WARN_ON(keyidx >= dev->max_nr_keys); + if (B43legacy_WARN_ON(keyidx >= dev->max_nr_keys)) + goto drop; if (dev->key[keyidx].algorithm != B43legacy_SEC_ALGO_NONE) { /* Remove PROTECTED flag to mark it as decrypted. */ -- cgit v1.2.3 From 3994b4afd521d60e47e012fe2ed7b606aaec370b Mon Sep 17 00:00:00 2001 From: Amir Mohammad Jahangirzad Date: Sat, 18 Apr 2026 04:12:47 +0330 Subject: wifi: libertas: fix integer underflow in process_cmdrequest() The existing validation only checks if recvlength exceeds LBS_CMD_BUFFER_SIZE, but doesn't check the lower bound. When a USB device sends a response shorter than MESSAGE_HEADER_LEN, the subtraction (recvlength - MESSAGE_HEADER_LEN) wraps to a huge value, causing memcpy to corrupt the heap. Add the same lower bound check that libertas_tf already has. 
Signed-off-by: Amir Mohammad Jahangirzad Link: https://patch.msgid.link/20260418004247.368944-1-a.jahangirzad@gmail.com Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/libertas/if_usb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c index 4fae0e335136..a00d53350fa9 100644 --- a/drivers/net/wireless/marvell/libertas/if_usb.c +++ b/drivers/net/wireless/marvell/libertas/if_usb.c @@ -633,9 +633,10 @@ static inline void process_cmdrequest(int recvlength, uint8_t *recvbuff, unsigned long flags; u8 i; - if (recvlength > LBS_CMD_BUFFER_SIZE) { + if (recvlength < MESSAGE_HEADER_LEN || + recvlength > LBS_CMD_BUFFER_SIZE) { lbs_deb_usbd(&cardp->udev->dev, - "The receive buffer is too large\n"); + "The receive buffer is invalid: %d\n", recvlength); kfree_skb(skb); return; } -- cgit v1.2.3 From 381cd547bc6e35a610c5dfebe554d891eea40f03 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 18:45:52 -0400 Subject: wifi: nl80211: require admin perm on SET_PMK / DEL_PMK NL80211_CMD_SET_PMK and NL80211_CMD_DEL_PMK manage the offloaded 4-way-handshake PMK state used by drivers advertising NL80211_EXT_FEATURE_4WAY_HANDSHAKE_STA_1X. The only in-tree driver that wires up both ->set_pmk / ->del_pmk and advertises the feature today is brcmfmac, so the practical reach of this patch is narrow. Both ops were introduced without a .flags gate, so the generic netlink layer dispatches them to an unprivileged caller instead of rejecting with -EPERM at the permission check. Every other connection-state op in the adjacent block (CONNECT, ASSOCIATE, AUTHENTICATE, SET_KEY, ...) carries GENL_UNS_ADMIN_PERM; SET_PMK / DEL_PMK were introduced without the flag in 2017 and left unchanged by later refactors. 
Johannes checked the original Intel submission history and confirmed there is no admin check in any prior revision either, so this seems likely to be a simple oversight rather than an intentional carve-out. Require GENL_UNS_ADMIN_PERM so the genl layer performs the same capable(CAP_NET_ADMIN) check as its siblings. wpa_supplicant already needs CAP_NET_ADMIN for every other nl80211 op it issues, so supplicant operation is unaffected. The worst case the missing gate enables today is an unprivileged local process on a multi-user system invalidating the offloaded PMK state of another user's 4-way-handshake session, forcing a full EAP re-auth on the next reconnect. Verified in UML: an unprivileged probe (uid=1000) sees SET_MULTICAST_TO_UNICAST (sibling op with GENL_UNS_ADMIN_PERM) return -EPERM on both pre- and post-fix kernels, while SET_PMK / DEL_PMK return -ENODEV from nl80211_pre_doit()'s wdev lookup pre- fix (proving dispatch crossed the genl permission check) and -EPERM post-fix (rejected at the genl layer as intended). 
Suggested-by: Johannes Berg Fixes: 3a00df5707b6 ("cfg80211: support 4-way handshake offloading for 802.1X") Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Acked-by: Arend van Spriel Link: https://patch.msgid.link/20260421224552.4044147-1-michael.bommarito@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f334cdef8958..67088804dcc7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -19828,6 +19828,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_SET_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_pmk, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_CLEAR_SKB), }, @@ -19835,6 +19836,7 @@ static const struct genl_small_ops nl80211_small_ops[] = { .cmd = NL80211_CMD_DEL_PMK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_del_pmk, + .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, { -- cgit v1.2.3 From 9b55d5c1f5e481e391957f9096d798ca331c461b Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 20:06:51 -0400 Subject: wifi: mac80211: check ieee80211_rx_data_set_link return in pubsta MLO path __ieee80211_rx_handle_packet() resolves the link via ieee80211_rx_data_set_link() on the pubsta->mlo path but ignores the helper's return value. Inside the helper, rx->link = rcu_dereference(rx->sdata->link[link_id]); can leave rx->link NULL if link_id references a slot already cleared by ieee80211_vif_set_links() during station-initiated ML reconfiguration (see mlme.c's ieee80211_ml_reconfiguration(), which invalidates sdata->link[] before the matching ieee80211_sta_remove_link() loop walks the link-sta hash). 
RX dispatch still resolves a link_sta from the hash and then drops into ieee80211_prepare_and_rx_handle(), which dereferences link->conf->addr. Every other user site of ieee80211_rx_data_set_link() checks the return and bails on failure; only this branch did not. Mirror the safe pattern. Fixes: e66b7920aa5a ("wifi: mac80211: fix initialization of rx->link and rx->link_sta") Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Michael Bommarito Link: https://patch.msgid.link/20260422000651.4184602-1-michael.bommarito@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3e5d1c47a5b0..5a92413a911f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -5380,7 +5380,9 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (!link_sta) goto out; - ieee80211_rx_data_set_link(&rx, link_sta->link_id); + if (!ieee80211_rx_data_set_link(&rx, + link_sta->link_id)) + goto out; } if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) -- cgit v1.2.3 From 7a5b81e0c87a075afd572f659d8eb68c9c4cd2ba Mon Sep 17 00:00:00 2001 From: Catherine Date: Fri, 24 Apr 2026 21:14:36 +0800 Subject: wifi: mac80211: drop stray 'static' from fast-RX rx_result ieee80211_invoke_fast_rx() is documented as safe for parallel RX, but its per-invocation rx_result is declared static. Concurrent callers then share one instance and can overwrite each other's result between ieee80211_rx_mesh_data() and the switch on res. That can make a packet that was queued or consumed by ieee80211_rx_mesh_data() fall through into ieee80211_rx_8023(), or make a packet that should continue return as queued. Make res an automatic variable so each invocation keeps its own result. 
Fixes: 3468e1e0c639 ("wifi: mac80211: add mesh fast-rx support") Cc: stable@vger.kernel.org Signed-off-by: Catherine Link: https://patch.msgid.link/20260424131435.83212-2-enderaoelyther@gmail.com Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5a92413a911f..d18e962126ce 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4971,7 +4971,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, struct sk_buff *skb = rx->skb; struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - static ieee80211_rx_result res; + ieee80211_rx_result res; int orig_len = skb->len; int hdrlen = ieee80211_hdrlen(hdr->frame_control); int snap_offs = hdrlen; -- cgit v1.2.3 From ada95e5e603bc6e353ee029f2ba7a7d9a42ad018 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 22 Apr 2026 17:06:46 +0300 Subject: tools/selftests: Use a sensible timeout value for iperf3 client The default timeout of cmd() is 5 seconds and Iperf3Runner requests the iperf3 client to run for 10 seconds, which clearly doesn't work since commit [1] enforced the timeout parameter. Use a value derived from duration as timeout (+5 seconds for startup/teardown/various other overhead). [1] commit f0bd19316663 ("selftests: net: fix timeout passed as positional argument to communicate()") Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- tools/testing/selftests/drivers/net/lib/py/load.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py index f181fa2d38fc..e24660e5c27f 100644 --- a/tools/testing/selftests/drivers/net/lib/py/load.py +++ b/tools/testing/selftests/drivers/net/lib/py/load.py @@ -48,7 +48,10 @@ class Iperf3Runner: Starts the iperf3 client with the configured options. 
""" cmdline = self._build_client(streams, duration, reverse) - return cmd(cmdline, background=background, host=self.env.remote) + kwargs = {"background": background, "host": self.env.remote} + if not background: + kwargs["timeout"] = duration + 5 + return cmd(cmdline, **kwargs) def measure_bandwidth(self, reverse=False): """ -- cgit v1.2.3 From e64e03b478e2da7093564819e903932fca2ddfa1 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 22 Apr 2026 17:06:47 +0300 Subject: tools/selftests: Add a VXLAN+IPsec traffic test There are VXLAN tests and IPsec tests, but there is no test that combines the two protocols and exercises the tunnel-over-ipsec code paths. Fix that by adding a traffic test with VXLAN and IPsec using crypto offload. This is runnable on HW which supports ESP offload (so no nsim unfortunately). Traffic is done with iperf3 and the test validates that there are no packet drops and iperf3 can get to at least 100 Mbps (a very conservative value on today's crypto offload HW, as it can typically reach multi-Gbps rates). Ran right now, the test fails due to a recently exposed bug in xfrm, which will be fixed in the next patch: # ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py TAP version 13 1..4 # Check| At ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py, # line 161, in test_vxlan_ipsec_crypto_offload: # Check| ksft_eq(drops_after - drops_before, 0, # Check failed 189 != 0 TX drops during VXLAN+IPsec # Check| At ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py, # line 163, in test_vxlan_ipsec_crypto_offload: # Check| ksft_ge(bw_gbps, 0.1, # Check failed 0.0015058278404812596 < 0.1 Minimum 100Mbps over # VXLAN+IPsec not ok 1 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v4_inner_v4 ... 
Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- tools/testing/selftests/drivers/net/hw/Makefile | 1 + tools/testing/selftests/drivers/net/hw/config | 5 + .../selftests/drivers/net/hw/ipsec_vxlan.py | 204 +++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 85ca4d1ecf9e..3b6ff4708005 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -30,6 +30,7 @@ TEST_PROGS = \ gro_hw.py \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ + ipsec_vxlan.py \ iou-zcrx.py \ irq.py \ loopback.sh \ diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config index dd50cb8a7911..ae0168c2bbe6 100644 --- a/tools/testing/selftests/drivers/net/hw/config +++ b/tools/testing/selftests/drivers/net/hw/config @@ -12,5 +12,10 @@ CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NETKIT=y CONFIG_NET_SCH_INGRESS=y +CONFIG_INET6_ESP=y +CONFIG_INET6_ESP_OFFLOAD=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y CONFIG_UDMABUF=y CONFIG_VXLAN=y +CONFIG_XFRM_USER=y diff --git a/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py b/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py new file mode 100755 index 000000000000..0740a4d85240 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""Traffic test for VXLAN + IPsec crypto-offload.""" + +import os + +from lib.py import ksft_run, ksft_exit, ksft_eq, ksft_ge +from lib.py import ksft_variants, KsftNamedVariant, KsftSkipEx +from lib.py import CmdExitFailure, NetDrvEpEnv, cmd, defer, ethtool, ip +from lib.py import Iperf3Runner + +# Inner tunnel addresses - TEST-NET-2 (RFC 5737) / doc prefix (RFC 3849) +INNER_V4_LOCAL = "198.51.100.1" 
+INNER_V4_REMOTE = "198.51.100.2" +INNER_V6_LOCAL = "2001:db8:100::1" +INNER_V6_REMOTE = "2001:db8:100::2" + +# ESP parameters +SPI_OUT = "0x1000" +SPI_IN = "0x1001" +# 128-bit key + 32-bit salt = 20 bytes hex, 128-bit ICV +ESP_AEAD = "aead 'rfc4106(gcm(aes))' 0x" + "01" * 20 + " 128" + + +def xfrm(args, host=None): + """Runs 'ip xfrm' via shell to preserve parentheses in algo names.""" + cmd(f"ip xfrm {args}", shell=True, host=host) + + +def check_xfrm_offload_support(): + """Skips if iproute2 lacks xfrm offload support.""" + out = cmd("ip xfrm state help", fail=False) + if "offload" not in out.stdout + out.stderr: + raise KsftSkipEx("iproute2 too old, missing xfrm offload") + + +def check_esp_hw_offload(cfg): + """Skips if device lacks esp-hw-offload support.""" + check_xfrm_offload_support() + try: + feat = ethtool(f"-k {cfg.ifname}", json=True)[0] + except (CmdExitFailure, IndexError) as e: + raise KsftSkipEx(f"can't query features: {e}") from e + if not feat.get("esp-hw-offload", {}).get("active"): + raise KsftSkipEx("Device does not support esp-hw-offload") + + +def get_tx_drops(cfg): + """Returns TX dropped counter from the physical device.""" + stats = ip("-s -s link show dev " + cfg.ifname, json=True)[0] + return stats["stats64"]["tx"]["dropped"] + + +def setup_vxlan_ipsec(cfg, outer_ipver, inner_ipver): + """Sets up VXLAN tunnel with IPsec transport-mode crypto-offload.""" + vxlan_name = f"vx{os.getpid()}" + local_addr = cfg.addr_v[outer_ipver] + remote_addr = cfg.remote_addr_v[outer_ipver] + + if inner_ipver == "4": + inner_local = f"{INNER_V4_LOCAL}/24" + inner_remote = f"{INNER_V4_REMOTE}/24" + addr_extra = "" + else: + inner_local = f"{INNER_V6_LOCAL}/64" + inner_remote = f"{INNER_V6_REMOTE}/64" + addr_extra = " nodad" + + if outer_ipver == "6": + vxlan_opts = "udp6zerocsumtx udp6zerocsumrx" + else: + vxlan_opts = "noudpcsum" + + # VXLAN tunnel - local side + ip(f"link add {vxlan_name} type vxlan id 100 dstport 4789 {vxlan_opts} " + f"local 
{local_addr} remote {remote_addr} dev {cfg.ifname}") + defer(ip, f"link del {vxlan_name}") + ip(f"addr add {inner_local} dev {vxlan_name}{addr_extra}") + ip(f"link set {vxlan_name} up") + + # VXLAN tunnel - remote side + ip(f"link add {vxlan_name} type vxlan id 100 dstport 4789 {vxlan_opts} " + f"local {remote_addr} remote {local_addr} dev {cfg.remote_ifname}", + host=cfg.remote) + defer(ip, f"link del {vxlan_name}", host=cfg.remote) + ip(f"addr add {inner_remote} dev {vxlan_name}{addr_extra}", + host=cfg.remote) + ip(f"link set {vxlan_name} up", host=cfg.remote) + + # xfrm state - local outbound SA + xfrm(f"state add src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT} " + f"{ESP_AEAD} " + f"mode transport offload crypto dev {cfg.ifname} dir out") + defer(xfrm, f"state del src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT}") + + # xfrm state - local inbound SA + xfrm(f"state add src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN} " + f"{ESP_AEAD} " + f"mode transport offload crypto dev {cfg.ifname} dir in") + defer(xfrm, f"state del src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN}") + + # xfrm state - remote outbound SA (mirror, software crypto) + xfrm(f"state add src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN} " + f"{ESP_AEAD} " + f"mode transport", + host=cfg.remote) + defer(xfrm, f"state del src {remote_addr} dst {local_addr} " + f"proto esp spi {SPI_IN}", host=cfg.remote) + + # xfrm state - remote inbound SA (mirror, software crypto) + xfrm(f"state add src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT} " + f"{ESP_AEAD} " + f"mode transport", + host=cfg.remote) + defer(xfrm, f"state del src {local_addr} dst {remote_addr} " + f"proto esp spi {SPI_OUT}", host=cfg.remote) + + # xfrm policy - local out + xfrm(f"policy add src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir out " + f"tmpl src {local_addr} dst {remote_addr} proto esp mode transport") + defer(xfrm, 
f"policy del src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir out") + + # xfrm policy - local in + xfrm(f"policy add src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir in " + f"tmpl src {remote_addr} dst {local_addr} proto esp mode transport") + defer(xfrm, f"policy del src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir in") + + # xfrm policy - remote out + xfrm(f"policy add src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir out " + f"tmpl src {remote_addr} dst {local_addr} proto esp mode transport", + host=cfg.remote) + defer(xfrm, f"policy del src {remote_addr} dst {local_addr} " + f"proto udp dport 4789 dir out", host=cfg.remote) + + # xfrm policy - remote in + xfrm(f"policy add src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir in " + f"tmpl src {local_addr} dst {remote_addr} proto esp mode transport", + host=cfg.remote) + defer(xfrm, f"policy del src {local_addr} dst {remote_addr} " + f"proto udp dport 4789 dir in", host=cfg.remote) + + +def _vxlan_ipsec_variants(): + """Generates outer/inner IP version variants.""" + for outer in ["4", "6"]: + for inner in ["4", "6"]: + yield KsftNamedVariant(f"outer_v{outer}_inner_v{inner}", outer, inner) + + +@ksft_variants(_vxlan_ipsec_variants()) +def test_vxlan_ipsec_crypto_offload(cfg, outer_ipver, inner_ipver): + """Tests VXLAN+IPsec crypto-offload has no TX drops.""" + cfg.require_ipver(outer_ipver) + check_esp_hw_offload(cfg) + + setup_vxlan_ipsec(cfg, outer_ipver, inner_ipver) + + if inner_ipver == "4": + inner_local = INNER_V4_LOCAL + inner_remote = INNER_V4_REMOTE + ping = "ping" + else: + inner_local = INNER_V6_LOCAL + inner_remote = INNER_V6_REMOTE + ping = "ping -6" + + cmd(f"{ping} -c 1 -W 2 {inner_remote}") + + drops_before = get_tx_drops(cfg) + + runner = Iperf3Runner(cfg, server_ip=inner_local, + client_ip=inner_remote) + bw_gbps = runner.measure_bandwidth(reverse=True) + + cfg.wait_hw_stats_settle() + drops_after = 
get_tx_drops(cfg) + + ksft_eq(drops_after - drops_before, 0, + comment="TX drops during VXLAN+IPsec") + ksft_ge(bw_gbps, 0.1, + comment="Minimum 100Mbps over VXLAN+IPsec") + + +def main(): + """Runs VXLAN+IPsec crypto-offload GSO selftest.""" + with NetDrvEpEnv(__file__, nsim_test=False) as cfg: + ksft_run([test_vxlan_ipsec_crypto_offload], args=(cfg,)) + ksft_exit() + + +if __name__ == "__main__": + main() -- cgit v1.2.3 From fa90a3145c0340c3f624206a81637c542254ea1d Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Wed, 22 Apr 2026 17:06:48 +0300 Subject: xfrm: Don't clobber inner headers when already set On VXLAN over IPsec egress, xfrm{4,6}_transport_output() blindly overwrite inner_transport_header (== the inner TCP header saved in VXLAN iptunnel_handle_offloads() -> skb_reset_inner_headers()) with the current transport_header (== the VXLAN outer UDP header set by udp_tunnel_xmit_skb()). This was a latent bug, harmless until commit [1] added a doff validation check in qdisc_pkt_len_segs_init() for encapsulated GSO packets. With the wrong inner_transport_header set by xfrm, qdisc_pkt_len_segs_init() interprets inner_transport_header as a TCP header, reads doff=0 from the upper byte of the VNI and drops the packet with DROP_REASON_SKB_BAD_GSO. Besides the use in GSO to determine the header size of segmented packets, inner_transport_header might be used by drivers to set up inner checksum offloading by pointing the HW to the inner transport header. A quick browse through available drivers shows that mlx5 uses skb->csum_start specifically for this scenario, while others either don't support VXLAN over IPsec crypto offload (ixgbe) or the HW is capable of parsing the packets itself (nfp, Chelsio). But in all cases, it is more correct to let the inner_transport_header point to the innermost header instead of overwriting it in xfrm. 
So fix this by guarding all four inner header save sites in xfrm_output.c (xfrm{4,6}_transport_output, xfrm{4,6}_tunnel_encap_add) with a check for skb->inner_protocol. When inner_protocol is set, a tunnel layer (VXLAN, Geneve, GRE, etc.) has already saved the correct inner header offsets and they must not be overwritten. When inner_protocol is zero, no prior tunnel encapsulation exists and xfrm must save the inner headers itself. The tunnel mode checks are only added for completion, since they aren't strictly required, as xfrm_output() forces software GSO in tunnel mode before encap. This makes the previously added test pass: # ./tools/testing/selftests/drivers/net/hw/ipsec_vxlan.py TAP version 13 1..4 ok 1 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v4_inner_v4 ok 2 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v4_inner_v6 ok 3 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v6_inner_v4 ok 4 ipsec_vxlan.test_vxlan_ipsec_crypto_offload.outer_v6_inner_v6 # Totals: pass:4 fail:0 xfail:0 xpass:0 skip:0 error:0 [1] commit 7fb4c1967011 ("net: pull headers in qdisc_pkt_len_segs_init()") Fixes: f1bd7d659ef0 ("xfrm: Add encapsulation header offsets while SKB is not encrypted") Signed-off-by: Cosmin Ratiu Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_output.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index a9652b422f51..cc35c2fcbbe0 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,7 +66,9 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) + skb_set_inner_transport_header(skb, + skb_transport_offset(skb)); skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + @@ -167,7 +169,9 @@ static int xfrm6_transport_output(struct xfrm_state 
*x, struct sk_buff *skb) int hdr_len; iph = ipv6_hdr(skb); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) + skb_set_inner_transport_header(skb, + skb_transport_offset(skb)); hdr_len = xfrm6_hdr_offset(x, skb, &prevhdr); if (hdr_len < 0) @@ -276,8 +280,10 @@ static int xfrm4_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *top_iph; int flags; - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + } skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + @@ -321,8 +327,10 @@ static int xfrm6_tunnel_encap_add(struct xfrm_state *x, struct sk_buff *skb) struct ipv6hdr *top_iph; int dsfield; - skb_set_inner_network_header(skb, skb_network_offset(skb)); - skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + if (!skb->inner_protocol) { + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + } skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + -- cgit v1.2.3 From db57a1aa54ff68669781976e4edb045e09e2b65b Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Thu, 23 Apr 2026 02:38:46 +0900 Subject: wifi: rsi: fix kthread lifetime race between self-exit and external-stop RSI driver use both self-exit(kthread_complete_and_exit) and external-stop (kthread_stop) when killing a kthread. Generally, kthread_stop() is called first, and in this case, no particular issues occur. However, in rare instances where kthread_complete_and_exit() is called first and then kthread_stop() is called, a UAF occurs because the kthread object, which has already exited and been freed, is accessed again. 
Therefore, to prevent this with minimal modification, remove kthread_stop() and instead wait until the self-exit operation has completed.
Fixes: 5461fc0c8d9f ("xfrm/compat: Add 64=>32-bit messages translator") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Ruijie Li Signed-off-by: Ren Wei Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d56450f61669..38a90e5ee3d9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3323,6 +3323,7 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping), [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; -- cgit v1.2.3 From 14acf9652e5690de3c7486c6db5fb8dafd0a32a3 Mon Sep 17 00:00:00 2001 From: Michal Kosiorek Date: Wed, 29 Apr 2026 10:54:51 +0200 Subject: xfrm: defensively unhash xfrm_state lists in __xfrm_state_delete KASAN reproduces a slab-use-after-free in __xfrm_state_delete()'s hlist_del_rcu calls under syzkaller load on linux-6.12.y stable (reproduced on 6.12.47, also reachable via the same code path on torvalds/master and on the ipsec tree). 
Nine unique signatures cluster in the xfrm_state lifecycle, the load-bearing one being: BUG: KASAN: slab-use-after-free in __hlist_del include/linux/list.h:990 [inline] BUG: KASAN: slab-use-after-free in hlist_del_rcu include/linux/rculist.h:516 [inline] BUG: KASAN: slab-use-after-free in __xfrm_state_delete net/xfrm/xfrm_state.c Write of size 8 at addr ffff8881198bcb70 by task kworker/u8:9/435 Workqueue: netns cleanup_net Call Trace: __hlist_del / hlist_del_rcu __xfrm_state_delete xfrm_state_delete xfrm_state_flush xfrm_state_fini ops_exit_list cleanup_net The other observed signatures hit the same slab object from __xfrm_state_lookup, xfrm_alloc_spi, __xfrm_state_insert and an OOB write variant of __xfrm_state_delete, all on the byseq/byspi hash chains. __xfrm_state_delete() guards its byseq and byspi unhashes with value-based predicates: if (x->km.seq) hlist_del_rcu(&x->byseq); if (x->id.spi) hlist_del_rcu(&x->byspi); while everywhere else in the file (e.g. state_cache, state_cache_input) the safer hlist_unhashed() check is used. xfrm_alloc_spi() sets x->id.spi = newspi inside xfrm_state_lock and then immediately inserts into byspi, but a path that observes x->id.spi != 0 outside of xfrm_state_lock can still skip-or-hit the byspi unhash inconsistently with whether x is actually on the list. The same holds for x->km.seq versus byseq, and the bydst/bysrc unhashes have no predicate at all, so a second __xfrm_state_delete() on the same object writes through LIST_POISON pprev. The defensive change here: - Use hlist_del_init_rcu() instead of hlist_del_rcu() on bydst, bysrc, byseq and byspi so a second deletion is a no-op rather than a write through LIST_POISON pprev. The byseq/byspi nodes are already initialised in xfrm_state_alloc(). - Test hlist_unhashed() rather than the value predicate for byseq/byspi, so the unhash decision tracks list state rather than mutable scalar fields. 
Empirical verification: applied this patch on top of v6.12.47, rebuilt, and re-ran the same syzkaller harness for 1h16m on a previously-crashy configuration that produced ~100 hits each of slab-use-after-free Read in xfrm_alloc_spi / Read in __xfrm_state_lookup / Write in __xfrm_state_delete. After the patch, 7.1M execs across 32 VMs at ~1550 exec/sec produced zero xfrm_state UAF/OOB hits. /proc/slabinfo confirms the xfrm_state slab is actively allocated and freed during the run (~143 KiB resident), so the fuzzer is still exercising those code paths -- they just no longer crash. Reproduction: - Linux 6.12.47 x86_64 + KASAN_GENERIC + KASAN_INLINE + KCOV - syzkaller @ 746545b8b1e4c3a128db8652b340d3df90ce61db - 32 QEMU/KVM VMs x 2 vCPU on AWS c5.metal bare metal - 9 unique signatures collected in ~9h, all within xfrm_state lifecycle Fixes: fe9f1d8779cb ("xfrm: add state hashtable keyed by seq") Fixes: 7b4dc3600e48 ("[XFRM]: Do not add a state whose SPI is zero to the SPI hash.") Reported-by: Michal Kosiorek Tested-by: Michal Kosiorek Cc: stable@vger.kernel.org Signed-off-by: Michal Kosiorek Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_state.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 1748d374abca..686014d39429 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -818,17 +818,17 @@ int __xfrm_state_delete(struct xfrm_state *x) spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); - hlist_del_rcu(&x->bydst); - hlist_del_rcu(&x->bysrc); - if (x->km.seq) - hlist_del_rcu(&x->byseq); + hlist_del_init_rcu(&x->bydst); + hlist_del_init_rcu(&x->bysrc); + if (!hlist_unhashed(&x->byseq)) + hlist_del_init_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); if (!hlist_unhashed(&x->state_cache_input)) hlist_del_rcu(&x->state_cache_input); - if (x->id.spi) - hlist_del_rcu(&x->byspi); + if (!hlist_unhashed(&x->byspi)) + 
hlist_del_init_rcu(&x->byspi); net->xfrm.state_num--; xfrm_nat_keepalive_state_updated(x); spin_unlock(&net->xfrm.xfrm_state_lock); -- cgit v1.2.3 From 1049970d7583194eedc30e45a3c898b2cb1c30ba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 27 Apr 2026 14:34:45 +0200 Subject: netfilter: replace skb_try_make_writable() by skb_ensure_writable() skb_try_make_writable() only works on clones and uncloned packets might have their network header in paged fragments. nft_fwd needs to work for the ingress and egress hooks, but the egress hook where skb->data points to the mac header, use skb_network_offset() to include the mac header. The flowtable is fine since it already uses the transport offset. Fixes: d32de98ea70f ("netfilter: nft_fwd_netdev: allow to forward packets via neighbour layer") Fixes: 7d2086871762 ("netfilter: nf_flow_table: move ipv4 offload hook code to nf_flow_table") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 4 ++-- net/netfilter/nft_fwd_netdev.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index fd56d663cb5b..dbd7644fdbeb 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -524,7 +524,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 0; } - if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + if (skb_ensure_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); @@ -1037,7 +1037,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, return 0; } - if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) + if (skb_ensure_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 4bce36c3a6a0..2cc809303ce8 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c 
@@ -100,6 +100,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, int oif = regs->data[priv->sreg_dev]; unsigned int verdict = NF_STOLEN; struct sk_buff *skb = pkt->skb; + int nhoff = skb_network_offset(skb); struct net_device *dev; int neigh_table; @@ -111,7 +112,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, verdict = NFT_BREAK; goto out; } - if (skb_try_make_writable(skb, sizeof(*iph))) { + if (skb_ensure_writable(skb, nhoff + sizeof(*iph))) { verdict = NF_DROP; goto out; } @@ -132,7 +133,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, verdict = NFT_BREAK; goto out; } - if (skb_try_make_writable(skb, sizeof(*ip6h))) { + if (skb_ensure_writable(skb, nhoff + sizeof(*ip6h))) { verdict = NF_DROP; goto out; } -- cgit v1.2.3 From 0a0b35f0bf10b4c2be607465f5c9c12c8681305b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 27 Apr 2026 14:34:48 +0200 Subject: netfilter: nft_fwd_netdev: add device and headroom validate with neigh forwarding The ttl field has been decremented already and evaluation of this rule would proceed, just drop this packet instead if there is no destination device to forwards this packet. This is exactly what nf_dup already does in this case. Moreover, check for headroom and call skb_expand_head() like in the IP output path to ensure there is sufficient headroom when forwarding this via neigh_xmit(). 
Fixes: d32de98ea70f ("netfilter: nft_fwd_netdev: allow to forward packets via neighbour layer") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_fwd_netdev.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 2cc809303ce8..605b1d42abce 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -102,6 +102,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, struct sk_buff *skb = pkt->skb; int nhoff = skb_network_offset(skb); struct net_device *dev; + unsigned int hh_len; int neigh_table; switch (priv->nfproto) { @@ -153,8 +154,19 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, } dev = dev_get_by_index_rcu(nft_net(pkt), oif); - if (dev == NULL) - return; + if (dev == NULL) { + verdict = NF_DROP; + goto out; + } + + hh_len = LL_RESERVED_SPACE(dev); + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) { + verdict = NF_STOLEN; + goto out; + } + } skb->dev = dev; skb_clear_tstamp(skb); -- cgit v1.2.3 From 1d47b55b36d2ec73fe6901212c8b28a593c3b27c Mon Sep 17 00:00:00 2001 From: Weiming Shi Date: Mon, 27 Apr 2026 14:34:50 +0200 Subject: netfilter: nft_fwd_netdev: use recursion counter in neigh egress path nft_fwd_neigh can be used in egress chains (NF_NETDEV_EGRESS). When the forwarding rule targets the same device or two devices forward to each other, neigh_xmit() triggers dev_queue_xmit() which re-enters nf_hook_egress(), causing infinite recursion and stack overflow. Move the nf_get_nf_dup_skb_recursion() accessor and NF_RECURSION_LIMIT to the shared header nf_dup_netdev.h as a static inline, so that nft_fwd_netdev can use the recursion counter directly without exported function call overhead. Guard neigh_xmit() with the same recursion limit already used in nf_do_netdev_egress(). [ Updated to cache the nf_get_nf_dup_skb_recursion pointer. 
--pablo ] Fixes: f87b9464d152 ("netfilter: nft_fwd_netdev: Support egress hook") Reported-by: Xiang Mei Signed-off-by: Weiming Shi Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_dup_netdev.h | 13 +++++++++++++ net/netfilter/nf_dup_netdev.c | 16 ---------------- net/netfilter/nft_fwd_netdev.c | 8 ++++++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h index b175d271aec9..609bcf422a9b 100644 --- a/include/net/netfilter/nf_dup_netdev.h +++ b/include/net/netfilter/nf_dup_netdev.h @@ -3,10 +3,23 @@ #define _NF_DUP_NETDEV_H_ #include +#include +#include void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif); void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif); +#define NF_RECURSION_LIMIT 2 + +static inline u8 *nf_get_nf_dup_skb_recursion(void) +{ +#ifndef CONFIG_PREEMPT_RT + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +#else + return ¤t->net_xmit.nf_dup_skb_recursion; +#endif +} + struct nft_offload_ctx; struct nft_flow_rule; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index e348fb90b8dc..3b0a70e154cd 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -13,22 +13,6 @@ #include #include -#define NF_RECURSION_LIMIT 2 - -#ifndef CONFIG_PREEMPT_RT -static u8 *nf_get_nf_dup_skb_recursion(void) -{ - return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); -} -#else - -static u8 *nf_get_nf_dup_skb_recursion(void) -{ - return ¤t->net_xmit.nf_dup_skb_recursion; -} - -#endif - static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 605b1d42abce..b9e88d7cf308 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -95,6 +95,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, struct nft_regs *regs, 
const struct nft_pktinfo *pkt) { + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); struct nft_fwd_neigh *priv = nft_expr_priv(expr); void *addr = &regs->data[priv->sreg_addr]; int oif = regs->data[priv->sreg_dev]; @@ -153,6 +154,11 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, goto out; } + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) { + verdict = NF_DROP; + goto out; + } + dev = dev_get_by_index_rcu(nft_net(pkt), oif); if (dev == NULL) { verdict = NF_DROP; goto out; } @@ -170,7 +176,9 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, skb->dev = dev; skb_clear_tstamp(skb); + (*nf_dup_skb_recursion)++; neigh_xmit(neigh_table, dev, addr, skb); + (*nf_dup_skb_recursion)--; out: regs->verdict.code = verdict; } -- cgit v1.2.3 From 6813985ca456d1f5677ad9554f55805cbf27e16f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 17:35:18 +0200 Subject: netfilter: x_tables: add .check_hooks to matches and targets Add a new .check_hooks interface for checking if the match/target is used from the validate hook according to its configuration. Move existing conditional hook check based on the match/target configuration from .checkentry to .check_hooks for the following matches/targets: - addrtype - devgroup - physdev - policy - set - TCPMSS - SET This is a preparation patch to fix nft_compat, no functional changes are intended. Based on patch from Florian Westphal. 
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 8 ++++ net/netfilter/x_tables.c | 79 ++++++++++++++++++++++++++++++++++---- net/netfilter/xt_TCPMSS.c | 33 ++++++++-------- net/netfilter/xt_addrtype.c | 25 +++++++++--- net/netfilter/xt_devgroup.c | 18 ++++++--- net/netfilter/xt_physdev.c | 20 +++++++--- net/netfilter/xt_policy.c | 24 +++++++++--- net/netfilter/xt_set.c | 39 ++++++++++++------- 8 files changed, 187 insertions(+), 59 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 77c778d84d4c..a81b46af5118 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -146,6 +146,9 @@ struct xt_match { /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); + /* Called to validate hooks based on the match configuration. */ + int (*check_hooks)(const struct xt_mtchk_param *); + /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -187,6 +190,9 @@ struct xt_target { /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); + /* Called to validate hooks based on the target configuration. */ + int (*check_hooks)(const struct xt_tgchk_param *); + /* Called when entry of this type deleted. 
*/ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -279,8 +285,10 @@ bool xt_find_jump_offset(const unsigned int *offsets, int xt_check_proc_name(const char *name, unsigned int size); +int xt_check_hooks_match(struct xt_mtchk_param *par); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); +int xt_check_hooks_target(struct xt_tgchk_param *par); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 9f837fb5ceb4..2c67c2e6b132 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -477,11 +477,9 @@ int xt_check_proc_name(const char *name, unsigned int size) } EXPORT_SYMBOL(xt_check_proc_name); -int xt_check_match(struct xt_mtchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_match_common(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->match->matchsize) != size && par->match->matchsize != -1) { /* @@ -530,6 +528,14 @@ int xt_check_match(struct xt_mtchk_param *par, par->match->proto); return -EINVAL; } + + return 0; +} + +static int xt_checkentry_match(struct xt_mtchk_param *par) +{ + int ret; + if (par->match->checkentry != NULL) { ret = par->match->checkentry(par); if (ret < 0) @@ -538,8 +544,34 @@ int xt_check_match(struct xt_mtchk_param *par, /* Flag up potential errors. 
*/ return -EIO; } + + return 0; +} + +int xt_check_hooks_match(struct xt_mtchk_param *par) +{ + if (par->match->check_hooks != NULL) + return par->match->check_hooks(par); + return 0; } +EXPORT_SYMBOL_GPL(xt_check_hooks_match); + +int xt_check_match(struct xt_mtchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_match_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_match(par); + if (ret < 0) + return ret; + + return xt_checkentry_match(par); +} EXPORT_SYMBOL_GPL(xt_check_match); /** xt_check_entry_match - check that matches end before start of target @@ -1012,11 +1044,9 @@ bool xt_find_jump_offset(const unsigned int *offsets, } EXPORT_SYMBOL(xt_find_jump_offset); -int xt_check_target(struct xt_tgchk_param *par, - unsigned int size, u16 proto, bool inv_proto) +static int xt_check_target_common(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) { - int ret; - if (XT_ALIGN(par->target->targetsize) != size) { pr_err_ratelimited("%s_tables: %s.%u target: invalid size %u (kernel) != (user) %u\n", xt_prefix[par->family], par->target->name, @@ -1061,6 +1091,23 @@ int xt_check_target(struct xt_tgchk_param *par, par->target->proto); return -EINVAL; } + + return 0; +} + +int xt_check_hooks_target(struct xt_tgchk_param *par) +{ + if (par->target->check_hooks != NULL) + return par->target->check_hooks(par); + + return 0; +} +EXPORT_SYMBOL_GPL(xt_check_hooks_target); + +static int xt_checkentry_target(struct xt_tgchk_param *par) +{ + int ret; + if (par->target->checkentry != NULL) { ret = par->target->checkentry(par); if (ret < 0) @@ -1071,6 +1118,22 @@ int xt_check_target(struct xt_tgchk_param *par, } return 0; } + +int xt_check_target(struct xt_tgchk_param *par, + unsigned int size, u16 proto, bool inv_proto) +{ + int ret; + + ret = xt_check_target_common(par, size, proto, inv_proto); + if (ret < 0) + return ret; + + ret = xt_check_hooks_target(par); + if (ret < 0) 
+ return ret; + + return xt_checkentry_target(par); +} EXPORT_SYMBOL_GPL(xt_check_target); /** diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 116a885adb3c..80e1634bc51f 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -247,6 +247,21 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) } #endif +static int tcpmss_tg4_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_tcpmss_info *info = par->targinfo; + + if (info->mss == XT_TCPMSS_CLAMP_PMTU && + (par->hook_mask & ~((1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING))) != 0) { + pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + return -EINVAL; + } + + return 0; +} + /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { @@ -262,17 +277,9 @@ static inline bool find_syn_match(const struct xt_entry_match *m) static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -286,17 +293,9 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par) #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; - if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (par->hook_mask & ~((1 << NF_INET_FORWARD) | - (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING))) != 0) { - 
pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); - return -EINVAL; - } if (par->nft_compat) return 0; @@ -312,6 +311,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), @@ -322,6 +322,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV6, .name = "TCPMSS", + .check_hooks = tcpmss_tg4_check_hooks, .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index a77088943107..913dbe3aa5e2 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -153,14 +153,10 @@ addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +static int addrtype_mt_check_hooks(const struct xt_mtchk_param *par) { - const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; struct xt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && @@ -176,6 +172,21 @@ static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) goto err; } + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) +{ + const char *errmsg = "both incoming and outgoing interface limitation cannot be selected"; + struct xt_addrtype_info_v1 *info = par->matchinfo; + + if (info->flags & XT_ADDRTYPE_LIMIT_IFACE_IN && + info->flags & XT_ADDRTYPE_LIMIT_IFACE_OUT) + goto err; + #if 
IS_ENABLED(CONFIG_IP6_NF_IPTABLES) if (par->family == NFPROTO_IPV6) { if ((info->source | info->dest) & XT_ADDRTYPE_BLACKHOLE) { @@ -211,6 +222,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE @@ -221,6 +233,7 @@ static struct xt_match addrtype_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .match = addrtype_mt_v1, + .check_hooks = addrtype_mt_check_hooks, .checkentry = addrtype_mt_checkentry_v1, .matchsize = sizeof(struct xt_addrtype_info_v1), .me = THIS_MODULE diff --git a/net/netfilter/xt_devgroup.c b/net/netfilter/xt_devgroup.c index 9520dd00070b..6d1a44ab5eee 100644 --- a/net/netfilter/xt_devgroup.c +++ b/net/netfilter/xt_devgroup.c @@ -33,14 +33,10 @@ static bool devgroup_mt(const struct sk_buff *skb, struct xt_action_param *par) return true; } -static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +static int devgroup_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_devgroup_info *info = par->matchinfo; - if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | - XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) - return -EINVAL; - if (info->flags & XT_DEVGROUP_MATCH_SRC && par->hook_mask & ~((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | @@ -56,9 +52,21 @@ static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) return 0; } +static int devgroup_mt_checkentry(const struct xt_mtchk_param *par) +{ + const struct xt_devgroup_info *info = par->matchinfo; + + if (info->flags & ~(XT_DEVGROUP_MATCH_SRC | XT_DEVGROUP_INVERT_SRC | + XT_DEVGROUP_MATCH_DST | XT_DEVGROUP_INVERT_DST)) + return -EINVAL; + + return 0; +} + static struct xt_match devgroup_mt_reg __read_mostly = { .name = "devgroup", .match = devgroup_mt, + .check_hooks = devgroup_mt_check_hooks, 
.checkentry = devgroup_mt_checkentry, .matchsize = sizeof(struct xt_devgroup_info), .family = NFPROTO_UNSPEC, diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index d2b0b52434fa..dd98f758176c 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -91,14 +91,10 @@ match_outdev: return (!!ret ^ !(info->invert & XT_PHYSDEV_OP_OUT)); } -static int physdev_mt_check(const struct xt_mtchk_param *par) +static int physdev_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_physdev_info *info = par->matchinfo; - static bool brnf_probed __read_mostly; - if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || - info->bitmask & ~XT_PHYSDEV_OP_MASK) - return -EINVAL; if (info->bitmask & (XT_PHYSDEV_OP_OUT | XT_PHYSDEV_OP_ISOUT) && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && @@ -107,6 +103,18 @@ static int physdev_mt_check(const struct xt_mtchk_param *par) return -EINVAL; } + return 0; +} + +static int physdev_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_physdev_info *info = par->matchinfo; + static bool brnf_probed __read_mostly; + + if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || + info->bitmask & ~XT_PHYSDEV_OP_MASK) + return -EINVAL; + #define X(memb) strnlen(info->memb, sizeof(info->memb)) >= sizeof(info->memb) if (info->bitmask & XT_PHYSDEV_OP_IN) { if (info->physindev[0] == '\0') @@ -141,6 +149,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV4, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), @@ -149,6 +158,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", .family = NFPROTO_IPV6, + .check_hooks = physdev_mt_check_hooks, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index 
b5fa65558318..ff54e3a8581e 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -126,13 +126,10 @@ policy_mt(const struct sk_buff *skb, struct xt_action_param *par) return ret; } -static int policy_mt_check(const struct xt_mtchk_param *par) +static int policy_mt_check_hooks(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; - const char *errmsg = "neither incoming nor outgoing policy selected"; - - if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) - goto err; + const char *errmsg; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { @@ -144,6 +141,21 @@ static int policy_mt_check(const struct xt_mtchk_param *par) errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } + + return 0; +err: + pr_info_ratelimited("%s\n", errmsg); + return -EINVAL; +} + +static int policy_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_policy_info *info = par->matchinfo; + const char *errmsg = "neither incoming nor outgoing policy selected"; + + if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) + goto err; + if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; @@ -158,6 +170,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), @@ -166,6 +179,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV6, + .check_hooks = policy_mt_check_hooks, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c index 731bc2cafae4..4ae04bba9358 100644 --- a/net/netfilter/xt_set.c +++ b/net/netfilter/xt_set.c @@ -430,6 +430,29 @@ set_target_v3(struct sk_buff *skb, const 
struct xt_action_param *par) return XT_CONTINUE; } +static int +set_target_v3_check_hooks(const struct xt_tgchk_param *par) +{ + const struct xt_set_info_target_v3 *info = par->targinfo; + + if (info->map_set.index != IPSET_INVALID_ID) { + if (strncmp(par->table, "mangle", 7)) { + pr_info_ratelimited("--map-set only usable from mangle table\n"); + return -EINVAL; + } + if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | + (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && + (par->hook_mask & ~(1 << NF_INET_FORWARD | + 1 << NF_INET_LOCAL_OUT | + 1 << NF_INET_POST_ROUTING))) { + pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); + return -EINVAL; + } + } + + return 0; +} + static int set_target_v3_checkentry(const struct xt_tgchk_param *par) { @@ -459,20 +482,6 @@ set_target_v3_checkentry(const struct xt_tgchk_param *par) } if (info->map_set.index != IPSET_INVALID_ID) { - if (strncmp(par->table, "mangle", 7)) { - pr_info_ratelimited("--map-set only usable from mangle table\n"); - ret = -EINVAL; - goto cleanup_del; - } - if (((info->flags & IPSET_FLAG_MAP_SKBPRIO) | - (info->flags & IPSET_FLAG_MAP_SKBQUEUE)) && - (par->hook_mask & ~(1 << NF_INET_FORWARD | - 1 << NF_INET_LOCAL_OUT | - 1 << NF_INET_POST_ROUTING))) { - pr_info_ratelimited("mapping of prio or/and queue is allowed only from OUTPUT/FORWARD/POSTROUTING chains\n"); - ret = -EINVAL; - goto cleanup_del; - } index = ip_set_nfnl_get_byindex(par->net, info->map_set.index); if (index == IPSET_INVALID_ID) { @@ -672,6 +681,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV4, .target = set_target_v3, .targetsize = sizeof(struct xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE @@ -682,6 +692,7 @@ static struct xt_target set_targets[] __read_mostly = { .family = NFPROTO_IPV6, .target = set_target_v3, .targetsize = sizeof(struct 
xt_set_info_target_v3), + .check_hooks = set_target_v3_check_hooks, .checkentry = set_target_v3_checkentry, .destroy = set_target_v3_destroy, .me = THIS_MODULE -- cgit v1.2.3 From 2f768d638d977eff824f64dcc9639e3fea32da8f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Apr 2026 19:04:07 +0200 Subject: netfilter: nft_compat: run xt_check_hooks_{match,target}() from .validate Several matches and one target check that the hook is correct from checkentry(), however, the basechain is only available from nft_table_validate(). This patch uses xt_check_hooks_{match,target}() from the nft_compat expression .validate path. This patch sets the table in the nft_ctx struct in nft_table_validate() which is required by this patch. Based on patch from Florian Westphal. Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") Reported-by: Xiang Mei Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 1 + net/netfilter/nft_compat.c | 45 +++++++++++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d20ce5c36d31..38e33c66c618 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4205,6 +4205,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) struct nft_chain *chain; struct nft_ctx ctx = { .net = net, + .table = (struct nft_table *)table, .family = table->family, }; int err = 0; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index decc725a33c2..0caa9304d2d0 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -261,10 +261,10 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; } - nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - nft_compat_wait_for_destructors(ctx->net); + nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); + ret = 
xt_check_target(&par, size, proto, inv); if (ret < 0) { if (ret == -ENOENT) { @@ -353,8 +353,6 @@ nla_put_failure: static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct xt_target *target = expr->ops->data; - unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && @@ -377,11 +375,21 @@ static int nft_target_validate(const struct nft_ctx *ctx, const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; + unsigned int hook_mask = 1 << ops->hooknum; + struct xt_target *target = expr->ops->data; + void *info = nft_expr_priv(expr); + struct xt_tgchk_param par; + union nft_entry e = {}; - hook_mask = 1 << ops->hooknum; if (target->hooks && !(hook_mask & target->hooks)) return -EINVAL; + nft_target_set_tgchk_param(&par, ctx, target, info, &e, 0, false); + + ret = xt_check_hooks_target(&par); + if (ret < 0) + return ret; + ret = nft_compat_chain_validate_dependency(ctx, target->table); if (ret < 0) return ret; @@ -515,10 +523,10 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return ret; } - nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); - nft_compat_wait_for_destructors(ctx->net); + nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); + return xt_check_match(&par, size, proto, inv); } @@ -614,8 +622,6 @@ static int nft_match_large_dump(struct sk_buff *skb, static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { - struct xt_match *match = expr->ops->data; - unsigned int hook_mask = 0; int ret; if (ctx->family != NFPROTO_IPV4 && @@ -638,11 +644,30 @@ static int nft_match_validate(const struct nft_ctx *ctx, const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops; + unsigned int hook_mask = 1 << ops->hooknum; + struct xt_match *match = expr->ops->data; + size_t size = XT_ALIGN(match->matchsize); + struct 
xt_mtchk_param par; + union nft_entry e = {}; + void *info; - hook_mask = 1 << ops->hooknum; if (match->hooks && !(hook_mask & match->hooks)) return -EINVAL; + if (NFT_EXPR_SIZE(size) > NFT_MATCH_LARGE_THRESH) { + struct nft_xt_match_priv *priv = nft_expr_priv(expr); + + info = priv->info; + } else { + info = nft_expr_priv(expr); + } + + nft_match_set_mtchk_param(&par, ctx, match, info, &e, 0, false); + + ret = xt_check_hooks_match(&par); + if (ret < 0) + return ret; + ret = nft_compat_chain_validate_dependency(ctx, match->table); if (ret < 0) return ret; -- cgit v1.2.3 From 8bedb6c46945752a688d9b0cf2021e0e68b1876c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 28 Apr 2026 19:37:57 +0200 Subject: netfilter: xt_CT: fix usersize for v1 and v2 revision While resurrecting the conntrack-tool test cases I found following bug: In: iptables -I OUTPUT -t raw -p 13 -j CT --timeout test-generic Out: [0:0] -A OUTPUT -p 13 -j CT --timeout test Data after first four bytes of the timeout policy name is never copied to userspace because its treated as kernel-only. 
Fixes: ec2318904965 ("xtables: extend matches and targets with .usersize") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_CT.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 498f5871c84a..d2aeacf94230 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -354,7 +354,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -366,7 +366,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV4, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -398,7 +398,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 1, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v1, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, @@ -410,7 +410,7 @@ static struct xt_target xt_ct_tg_reg[] __read_mostly = { .family = NFPROTO_IPV6, .revision = 2, .targetsize = sizeof(struct xt_ct_target_info_v1), - .usersize = offsetof(struct xt_ct_target_info, ct), + .usersize = offsetof(struct xt_ct_target_info_v1, ct), .checkentry = xt_ct_tg_check_v2, .destroy = xt_ct_tg_destroy_v1, .target = xt_ct_target_v1, -- cgit v1.2.3 From 63bac027860308d1344f761cb47aabb3b30973fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Apr 2026 
08:21:35 +0200 Subject: netfilter: nf_tables: fix netdev hook allocation memleak with dormant tables sashiko says: could the related code in __nf_tables_abort() leak the struct nft_hook objects when the table is dormant? In __nf_tables_abort(), when rolling back a NEWCHAIN transaction that updates hooks, the code conditionally unregisters and frees the hooks only if the table is not dormant [..] if (!(table->flags & NFT_TABLE_F_DORMANT)) { nft_netdev_unregister_hooks(net, &nft_trans_chain_hooks(trans), true); } ... nft_trans_destroy(trans); Unfortunately netdev family mixes hook registration and allocation. Push table struct down and only check for the flag to unregister. Fixes: 216e7bf7402c ("netfilter: nf_tables: skip netdev hook unregistration if table is dormant") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 38e33c66c618..87387adbca65 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -407,6 +407,7 @@ static void nft_netdev_unregister_trans_hook(struct net *net, } static void nft_netdev_unregister_hooks(struct net *net, + const struct nft_table *table, struct list_head *hook_list, bool release_netdev) { @@ -414,8 +415,10 @@ static void nft_netdev_unregister_hooks(struct net *net, struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - list_for_each_entry(ops, &hook->ops_list, list) - nf_unregister_net_hook(net, ops); + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + } if (release_netdev) nft_netdev_hook_unlink_free_rcu(hook); } @@ -452,20 +455,25 @@ static void __nf_tables_unregister_hook(struct net *net, struct nft_base_chain *basechain; const struct nf_hook_ops *ops; - if (table->flags & 
NFT_TABLE_F_DORMANT || - !nft_is_base_chain(chain)) + if (!nft_is_base_chain(chain)) return; basechain = nft_base_chain(chain); ops = &basechain->ops; + /* must also be called for dormant tables */ + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { + nft_netdev_unregister_hooks(net, table, &basechain->hook_list, + release_netdev); + return; + } + + if (table->flags & NFT_TABLE_F_DORMANT) + return; + if (basechain->type->ops_unregister) return basechain->type->ops_unregister(net, ops); - if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) - nft_netdev_unregister_hooks(net, &basechain->hook_list, - release_netdev); - else - nf_unregister_net_hook(net, &basechain->ops); + nf_unregister_net_hook(net, &basechain->ops); } static void nf_tables_unregister_hook(struct net *net, @@ -11282,11 +11290,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - if (!(table->flags & NFT_TABLE_F_DORMANT)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); - } + nft_netdev_unregister_hooks(net, table, + &nft_trans_chain_hooks(trans), + true); free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); -- cgit v1.2.3 From 0bf00859d7a5ab685901c36f29df063b825cfaaa Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 28 Apr 2026 12:25:46 +0200 Subject: netfilter: nf_socket: skip socket lookup for non-first fragments Both nft_socket and xt_socket relies on L4 headers to perform socket lookup in the slow path. For fragmented packets, while the IP protocol remains constant across all fragments, only the first fragment contains the actual L4 header. As the expression/match could be attached to a chain with a priority lower than -400, it could bypass defragmentation. 
Add a check for fragmentation in the lookup functions directly so the problem is handled for both nft_socket and xt_socket at the same time. In addition, future users of the functions would not need to care about this. Fixes: 902d6a4c2a4f ("netfilter: nf_defrag: Skip defrag if NOTRACK is set") Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_socket_ipv4.c | 3 +++ net/ipv6/netfilter/nf_socket_ipv6.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 5080fa5fbf6a..f9c6755f5ec5 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -94,6 +94,9 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, #endif int doff = 0; + if (ntohs(iph->frag_off) & IP_OFFSET) + return NULL; + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { struct tcphdr _hdr; struct udphdr *hp; diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index ced8bd44828e..893f2aeb4711 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -100,6 +100,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, const struct in6_addr *daddr = NULL, *saddr = NULL; struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; + unsigned short fragoff = 0; int doff = 0; int thoff = 0, tproto; #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -107,8 +108,8 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct nf_conn const *ct; #endif - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) { + tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (tproto < 0 || fragoff) { pr_debug("unable to find transport header in IPv6 packet, 
dropping\n"); return NULL; } -- cgit v1.2.3 From 009d203e56dbe8db2589455b9e3644955f30313a Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 28 Apr 2026 12:25:47 +0200 Subject: netfilter: nf_tables: skip L4 header parsing for non-first fragments The tproxy, osf and exthdr (SCTP) expressions rely on the presence of transport layer headers to perform socket lookups, fingerprint matching, or chunk extraction. For fragmented packets, while the IP protocol remains constant across all fragments, only the first fragment contains the actual L4 header. The expressions could be attached to a chain with a priority lower than -400, bypassing defragmentation. Or could be used in stateless environments where defragmentation is not happening at all. This could result in garbage data being used for the matching. Add a check for pkt->fragoff so only unfragmented packets or the first fragment is processed. Fixes: 133dc203d77d ("netfilter: nft_exthdr: Support SCTP chunks") Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") Fixes: b96af92d6eaf ("netfilter: nf_tables: implement Passive OS fingerprint module in nft_osf") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_core.c | 2 +- net/netfilter/nft_exthdr.c | 2 +- net/netfilter/nft_osf.c | 2 +- net/netfilter/nft_tproxy.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 5ddd5b6e135f..8ab186f86dd4 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -153,7 +153,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb) + pkt->nhoff; else { - if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) return false; ptr = skb->data + nft_thoff(pkt); } diff --git a/net/netfilter/nft_exthdr.c 
b/net/netfilter/nft_exthdr.c index 0407d6f708ae..e6a07c0df207 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -376,7 +376,7 @@ static void nft_exthdr_sctp_eval(const struct nft_expr *expr, const struct sctp_chunkhdr *sch; struct sctp_chunkhdr _sch; - if (pkt->tprot != IPPROTO_SCTP) + if (pkt->tprot != IPPROTO_SCTP || pkt->fragoff) goto err; do { diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index c02d5cb52143..45fe56da5044 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -33,7 +33,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - if (pkt->tprot != IPPROTO_TCP) { + if (pkt->tprot != IPPROTO_TCP || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index f2101af8c867..89be443734f6 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -30,8 +30,8 @@ static void nft_tproxy_eval_v4(const struct nft_expr *expr, __be16 tport = 0; struct sock *sk; - if (pkt->tprot != IPPROTO_TCP && - pkt->tprot != IPPROTO_UDP) { + if ((pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } @@ -97,8 +97,8 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr, memset(&taddr, 0, sizeof(taddr)); - if (pkt->tprot != IPPROTO_TCP && - pkt->tprot != IPPROTO_UDP) { + if ((pkt->tprot != IPPROTO_TCP && + pkt->tprot != IPPROTO_UDP) || pkt->fragoff) { regs->verdict.code = NFT_BREAK; return; } -- cgit v1.2.3 From 952e121c96137c73bd3e59bb20a93ef659376947 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 28 Apr 2026 12:25:48 +0200 Subject: netfilter: xtables: fix L4 header parsing for non-first fragments Multiple targets and matches relies on L4 header to operate. For fragmented packets, every fragment carries the transport protocol identifier, but only the first fragment contains the L4 header. 
As the 'raw' table can be configured to run at priority -450 (before defragmentation at -400), the target/match can be reached before reassembly. In this case, non-first fragments have their payload incorrectly parsed as a TCP/UDP header. This would be of course a misconfiguration scenario. In most of the cases this just leads to unreliable behavior for fragmented traffic. Add a fragment check to ensure target/match only evaluates unfragmented packets or the first fragment in the stream. Fixes: 902d6a4c2a4f ("netfilter: nf_defrag: Skip defrag if NOTRACK is set") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TPROXY.c | 11 +++++++++-- net/netfilter/xt_ecn.c | 4 ++++ net/netfilter/xt_hashlimit.c | 4 +++- net/netfilter/xt_osf.c | 3 +++ net/netfilter/xt_tcpmss.c | 4 ++++ 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index e4bea1d346cf..5f60e7298a1e 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -86,6 +86,9 @@ tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info *tgi = par->targinfo; + if (par->fragoff) + return NF_DROP; + return tproxy_tg4(xt_net(par), skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); } @@ -95,6 +98,9 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + if (par->fragoff) + return NF_DROP; + return tproxy_tg4(xt_net(par), skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); } @@ -106,6 +112,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + unsigned short fragoff = 0; struct udphdr _hdr, *hp; struct sock *sk; const struct in6_addr *laddr; @@ -113,8 +120,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct 
xt_action_param *par) int thoff = 0; int tproto; - tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); - if (tproto < 0) + tproto = ipv6_find_hdr(skb, &thoff, -1, &fragoff, NULL); + if (tproto < 0 || fragoff) return NF_DROP; hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); diff --git a/net/netfilter/xt_ecn.c b/net/netfilter/xt_ecn.c index b96e8203ac54..a8503f5d26bf 100644 --- a/net/netfilter/xt_ecn.c +++ b/net/netfilter/xt_ecn.c @@ -30,6 +30,10 @@ static bool match_tcp(const struct sk_buff *skb, struct xt_action_param *par) struct tcphdr _tcph; const struct tcphdr *th; + /* this is fine for IPv6 as ecn_mt_check6() enforces -p tcp */ + if (par->fragoff) + return false; + /* In practice, TCP match does this, so can't fail. But let's * be good citizens. */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 3bd127bfc114..2704b4b60d1e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -658,6 +658,8 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, if (!(hinfo->cfg.mode & (XT_HASHLIMIT_HASH_DPT | XT_HASHLIMIT_HASH_SPT))) return 0; + if (ntohs(ip_hdr(skb)->frag_off) & IP_OFFSET) + return -1; nexthdr = ip_hdr(skb)->protocol; break; #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -681,7 +683,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; nexthdr = ipv6_hdr(skb)->nexthdr; protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if ((int)protoff < 0) + if ((int)protoff < 0 || ntohs(frag_off) & IP6_OFFSET) return -1; break; } diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index dc9485854002..e8807caede68 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -27,6 +27,9 @@ static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { + if (p->fragoff) + return false; + return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p), xt_out(p), p->matchinfo, xt_net(p), nf_osf_fingers); } diff --git 
a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 0d32d4841cb3..b9da8269161d 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -32,6 +32,10 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) u8 _opt[15 * 4 - sizeof(_tcph)]; unsigned int i, optlen; + /* this is fine for IPv6 as xt_tcpmss enforces -p tcp */ + if (par->fragoff) + return false; + /* If we don't have the whole header, drop packet. */ th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) -- cgit v1.2.3 From ef4f741e8627512cb8c82f59a1fc7aacd854aadf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 16:49:48 +0200 Subject: netfilter: flowtable: ensure sufficient headroom in xmit path Check for headroom and call skb_expand_head() like in the IP output path to ensure there is sufficient headroom for the mac header when forwarding this packet as suggested by sashiko. Fixes: b5964aac51e0 ("netfilter: flowtable: consolidate xmit path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index dbd7644fdbeb..8d5fb7e940a1 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -471,8 +471,17 @@ struct nf_flow_xmit { static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, struct nf_flow_xmit *xmit) { - skb->dev = xmit->outdev; - dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + struct net_device *dev = xmit->outdev; + unsigned int hh_len = LL_RESERVED_SPACE(dev); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, hh_len); + if (!skb) + return NF_STOLEN; + } + + skb->dev = dev; + dev_hard_header(skb, dev, ntohs(skb->protocol), xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); -- cgit v1.2.3 From 18ed60e33e6c77d62409c1343dec1c61bae3d2e7 Mon Sep 
17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Apr 2026 16:21:41 +0800 Subject: net: mctp: test: use a zeroed struct sockaddr_mctp Invalid sockaddr padding will cause bind() to fail; ensure we have a zeroed address in the testcase. Fixes: 0d8647bc74cb ("net: mctp: don't require a route for null-EID ingress") Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429-dev-mctp-test-fixes-v1-1-1127b7425809@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/route-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index e1033643fab0..e4b230ef6099 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -920,9 +920,9 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) static void mctp_test_route_input_null_eid(struct kunit *test) { struct mctp_hdr hdr = RX_HDR(1, 10, 0, FL_S | FL_E | FL_TO); + struct sockaddr_mctp addr = { 0 }; struct sk_buff *skb_pkt, *skb_sk; struct mctp_test_dev *dev; - struct sockaddr_mctp addr; struct socket *sock; u8 type = 0; int rc; -- cgit v1.2.3 From 76872971064133474d9b891da05db8f7586fcc11 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Apr 2026 16:21:42 +0800 Subject: net: mctp: test: Use dev_direct_xmit for TX to our test device In our test cases, we typically feed a packet sequence into the routing code, then inspect the device's TXed skbs to assert specific behaviours. Using dev_queue_xmit() for our TX path introduces a fair bit of complexity between the test packet sequence and the test device's ndo_start_xmit callback, which may mean that the skbs have not hit the device at the point we're inspecting the TXed skb list. Use dev_direct_xmit instead, as we want as direct a path as possible here, and the test dev does not need any queueing, scheduling or flow control. 
Fixes: 6ab578739a4c ("net: mctp: test: move TX packetqueue from dst to dev") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202604281320.525eee17-lkp@intel.com Signed-off-by: Jeremy Kerr Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429-dev-mctp-test-fixes-v1-2-1127b7425809@codeconstruct.com.au Signed-off-by: Jakub Kicinski --- net/mctp/test/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index c3987d5ade7a..6eef8d485c25 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -116,7 +116,7 @@ void mctp_test_destroy_dev(struct mctp_test_dev *dev) static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { skb->dev = dst->dev->dev; - dev_queue_xmit(skb); + dev_direct_xmit(skb, 0); return 0; } -- cgit v1.2.3 From a177ae30f78688f75ef9c6277a152c5d6979b10e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 16:49:51 +0200 Subject: netfilter: flowtable: fix inline vlan encapsulation in xmit path Several issues in the inline vlan support: - The layer 2 encapsulation representation in the tuple takes encap[0] as the outer header and encap[1] as the inner header as seen from the ingress path. Reverse the encap loop to push first the inner then the outer vlan header. - Postpone pushing the layer 2 header until the destination device is known. This allows calculating the needed headroom via LL_RESERVED_SPACE to accommodate the layer 2 headers. - Add and use nf_flow_vlan_push() as suggested by Eric Woudstra, this is a simplified version of skb_vlan_push() for egress path only. 
Fixes: c653d5a78f34 ("netfilter: flowtable: inline vlan encapsulation in xmit path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 110 ++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 37 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8d5fb7e940a1..0ce3c209050c 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -462,32 +462,6 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, nf_flow_ip_tunnel_pop(ctx, skb); } -struct nf_flow_xmit { - const void *dest; - const void *source; - struct net_device *outdev; -}; - -static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - struct nf_flow_xmit *xmit) -{ - struct net_device *dev = xmit->outdev; - unsigned int hh_len = LL_RESERVED_SPACE(dev); - - if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - skb = skb_expand_head(skb, hh_len); - if (!skb) - return NF_STOLEN; - } - - skb->dev = dev; - dev_hard_header(skb, dev, ntohs(skb->protocol), - xmit->dest, xmit->source, skb->len); - dev_queue_xmit(skb); - - return NF_STOLEN; -} - static struct flow_offload_tuple_rhash * nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct sk_buff *skb) @@ -553,6 +527,32 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +/* Similar to skb_vlan_push. 
*/ +static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id, + u32 needed_headroom) +{ + if (skb_vlan_tag_present(skb)) { + struct vlan_hdr *vhdr; + + if (skb_cow_head(skb, needed_headroom + VLAN_HLEN)) + return -1; + + __skb_push(skb, VLAN_HLEN); + if (skb_mac_header_was_set(skb)) + skb->mac_header -= VLAN_HLEN; + + vhdr = (struct vlan_hdr *)skb->data; + skb->network_header -= VLAN_HLEN; + vhdr->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + vhdr->h_vlan_encapsulated_proto = skb->protocol; + skb->protocol = skb->vlan_proto; + skb_postpush_rcsum(skb, skb->data, VLAN_HLEN); + } + __vlan_hwaccel_put_tag(skb, proto, id); + + return 0; +} + static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) { int data_len = skb->len + sizeof(__be16); @@ -739,17 +739,19 @@ static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb, } static int nf_flow_encap_push(struct sk_buff *skb, - struct flow_offload_tuple *tuple) + struct flow_offload_tuple *tuple, + struct net_device *outdev) { + u32 needed_headroom = LL_RESERVED_SPACE(outdev); int i; - for (i = 0; i < tuple->encap_num; i++) { + for (i = tuple->encap_num - 1; i >= 0; i--) { switch (tuple->encap[i].proto) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): - skb_reset_mac_header(skb); - if (skb_vlan_push(skb, tuple->encap[i].proto, - tuple->encap[i].id) < 0) + if (nf_flow_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id, + needed_headroom) < 0) return -1; break; case htons(ETH_P_PPP_SES): @@ -762,6 +764,44 @@ static int nf_flow_encap_push(struct sk_buff *skb, return 0; } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; + struct flow_offload_tuple *tuple; +}; + +static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + struct net_device *dev = xmit->outdev; + unsigned int hh_len = LL_RESERVED_SPACE(dev); + + if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { + skb = skb_expand_head(skb, 
hh_len); + if (!skb) + return; + } + + skb->dev = dev; + dev_hard_header(skb, dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); + dev_queue_xmit(skb); +} + +static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + if (xmit->tuple->encap_num) { + if (nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0) + return NF_DROP; + } + + __nf_flow_queue_xmit(net, skb, xmit); + + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -806,9 +846,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) return NF_DROP; - if (nf_flow_encap_push(skb, other_tuple) < 0) - return NF_DROP; - switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); @@ -838,6 +875,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, WARN_ON_ONCE(1); return NF_DROP; } + xmit.tuple = other_tuple; return nf_flow_queue_xmit(state->net, skb, &xmit); } @@ -1128,9 +1166,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, &ip6_daddr, encap_limit) < 0) return NF_DROP; - if (nf_flow_encap_push(skb, other_tuple) < 0) - return NF_DROP; - switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); @@ -1160,6 +1195,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, WARN_ON_ONCE(1); return NF_DROP; } + xmit.tuple = other_tuple; return nf_flow_queue_xmit(state->net, skb, &xmit); } -- cgit v1.2.3 From 69c54f80f4a7072b51b5b5939185ca5e572be982 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 16:49:53 +0200 Subject: netfilter: flowtable: fix inline pppoe encapsulation in xmit path Address two issues in the inline pppoe encapsulation: - Add needs_gso_segment flag to segment PPPoE packets in software given that there is no GSO support 
for this. - Use FLOW_OFFLOAD_XMIT_DIRECT since neighbour cache is not available in point-to-point device, use the hardware address that is obtained via flowtable path discovery (ie. fill_forward_path). Fixes: 18d27bed0880 ("netfilter: flowtable: inline pppoe encapsulation in xmit path") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 4 +++- net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_flow_table_ip.c | 42 ++++++++++++++++++++++++++++++++--- net/netfilter/nf_flow_table_path.c | 7 +++++- 4 files changed, 49 insertions(+), 5 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index b09c11c048d5..7b23b245a5a8 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -148,9 +148,10 @@ struct flow_offload_tuple { /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; - u8 dir:2, + u16 dir:2, xmit_type:3, encap_num:2, + needs_gso_segment:1, tun_num:2, in_vlan_ingress:2; u16 mtu; @@ -232,6 +233,7 @@ struct nf_flow_route { u32 hw_ifindex; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; + u8 needs_gso_segment:1; } out; enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 2c4140e6f53c..785d8c244a77 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -122,6 +122,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment; flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 0ce3c209050c..2eba64eb393a 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ 
b/net/netfilter/nf_flow_table_ip.c @@ -553,7 +553,8 @@ static int nf_flow_vlan_push(struct sk_buff *skb, __be16 proto, u16 id, return 0; } -static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id, + u32 needed_headroom) { int data_len = skb->len + sizeof(__be16); struct ppp_hdr { @@ -562,7 +563,7 @@ static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) } *ph; __be16 proto; - if (skb_cow_head(skb, PPPOE_SES_HLEN)) + if (skb_cow_head(skb, needed_headroom + PPPOE_SES_HLEN)) return -1; switch (skb->protocol) { @@ -755,7 +756,8 @@ static int nf_flow_encap_push(struct sk_buff *skb, return -1; break; case htons(ETH_P_PPP_SES): - if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + if (nf_flow_pppoe_push(skb, tuple->encap[i].id, + needed_headroom) < 0) return -1; break; } @@ -769,6 +771,7 @@ struct nf_flow_xmit { const void *source; struct net_device *outdev; struct flow_offload_tuple *tuple; + bool needs_gso_segment; }; static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, @@ -789,10 +792,41 @@ static void __nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, dev_queue_xmit(skb); } +static unsigned int nf_flow_encap_gso_xmit(struct net *net, struct sk_buff *skb, + struct nf_flow_xmit *xmit) +{ + struct sk_buff *segs, *nskb; + + segs = skb_gso_segment(skb, 0); + if (IS_ERR(segs)) + return NF_DROP; + + if (segs) + consume_skb(skb); + else + segs = skb; + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + + if (nf_flow_encap_push(segs, xmit->tuple, xmit->outdev) < 0) { + kfree_skb(segs); + kfree_skb_list(nskb); + return NF_STOLEN; + } + __nf_flow_queue_xmit(net, segs, xmit); + } + + return NF_STOLEN; +} + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, struct nf_flow_xmit *xmit) { if (xmit->tuple->encap_num) { + if (skb_is_gso(skb) && xmit->needs_gso_segment) + return nf_flow_encap_gso_xmit(net, skb, xmit); + if 
(nf_flow_encap_push(skb, xmit->tuple, xmit->outdev) < 0) return NF_DROP; } @@ -876,6 +910,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_DROP; } xmit.tuple = other_tuple; + xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment; return nf_flow_queue_xmit(state->net, skb, &xmit); } @@ -1196,6 +1231,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return NF_DROP; } xmit.tuple = other_tuple; + xmit.needs_gso_segment = tuplehash->tuple.needs_gso_segment; return nf_flow_queue_xmit(state->net, skb, &xmit); } diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index 6bb9579dcc2a..9e88ea6a2eef 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -86,6 +86,7 @@ struct nft_forward_info { u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; + bool needs_gso_segment; enum flow_offload_xmit_type xmit_type; }; @@ -138,8 +139,11 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, path->encap.proto; info->num_encaps++; } - if (path->type == DEV_PATH_PPPOE) + if (path->type == DEV_PATH_PPPOE) { memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + info->needs_gso_segment = 1; + } break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) @@ -279,6 +283,7 @@ static void nft_dev_forward_path(const struct nft_pktinfo *pkt, memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].xmit_type = info.xmit_type; } + route->tuple[dir].out.needs_gso_segment = info.needs_gso_segment; } int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, -- cgit v1.2.3 From e027c218c482c6a0ae1948129ccda3b0a2033368 Mon Sep 17 00:00:00 2001 From: Robert Marko Date: Tue, 28 Apr 2026 15:41:01 +0200 Subject: net: phy: micrel: fix LAN8814 QSGMII soft reset LAN8814 QSGMII soft reset was moved into the probe function to avoid triggering it for each of 4 PHY-s in the 
package. However, that broke the QSGMII link between the MAC and PHY on most LAN8814 PHY-s, specifically for us on the Microchip LAN969x switch. Reading the QSGMII status registers it was visible that lanes were only partially synced. It looks like the reset timing is crucial, so let's move the reset back into the .config_init function but guard it with phy_package_init_once() to avoid it being triggered on each of 4 PHY-s in the package. Change the probe function to use phy_package_probe_once() for coma and PtP setup. Fixes: 96a9178a29a6 ("net: phy: micrel: lan8814 fix reset of the QSGMII interface") Signed-off-by: Robert Marko Link: https://patch.msgid.link/20260428134138.1741253-1-robert.marko@sartura.hr Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 2aa1dedd21b8..e211a523c258 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -4548,6 +4548,13 @@ static int lan8814_config_init(struct phy_device *phydev) struct kszphy_priv *lan8814 = phydev->priv; int ret; + if (phy_package_init_once(phydev)) + /* Reset the PHY */ + lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_QSGMII_SOFT_RESET, + LAN8814_QSGMII_SOFT_RESET_BIT, + LAN8814_QSGMII_SOFT_RESET_BIT); + /* Based on the interface type select how the advertise ability is * encoded, to set as SGMII or as USGMII. 
*/ @@ -4655,13 +4662,7 @@ static int lan8814_probe(struct phy_device *phydev) priv->is_ptp_available = err == LAN8814_REV_LAN8814 || err == LAN8814_REV_LAN8818; - if (phy_package_init_once(phydev)) { - /* Reset the PHY */ - lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, - LAN8814_QSGMII_SOFT_RESET, - LAN8814_QSGMII_SOFT_RESET_BIT, - LAN8814_QSGMII_SOFT_RESET_BIT); - + if (phy_package_probe_once(phydev)) { err = lan8814_release_coma_mode(phydev); if (err) return err; -- cgit v1.2.3 From 3744b0964d5267c0b651bcd8f8c25db6bf4ccbac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 29 Apr 2026 17:46:48 +0200 Subject: ipv6: Implement limits on extension header parsing ipv6_{skip_exthdr,find_hdr}() and ip6_{tnl_parse_tlv_enc_lim, protocol_deliver_rcu}() iterate over IPv6 extension headers until they find a non-extension-header protocol or run out of packet data. The loops have no iteration counter, relying solely on the packet length to bound them. For a crafted packet with 8-byte extension headers filling a 64KB jumbogram, this means a worst case of up to ~8k iterations with a skb_header_pointer call each. ipv6_skip_exthdr(), for example, is used where it parses the inner quoted packet inside an incoming ICMPv6 error: - icmpv6_rcv - checksum validation - case ICMPV6_DEST_UNREACH - icmpv6_notify - pskb_may_pull() <- pull inner IPv6 header - ipv6_skip_exthdr() <- iterates here - pskb_may_pull() - ipprot->err_handler() <- sk lookup The per-iteration cost of ipv6_skip_exthdr itself is generally light, but skb_header_pointer becomes more costly on reassembled packets: the first ~1232 bytes of the inner packet are in the skb's linear area, but the remaining ~63KB are in the frag_list where skb_copy_bits is needed to read data. 
Initially, the idea was to add a configurable limit via a new sysctl knob with default 8, in line with knobs from commit 47d3d7ac656a ("ipv6: Implement limits on Hop-by-Hop and Destination options"), but two reasons eventually argued against it: - It adds to UAPI that needs to be maintained forever, and upcoming work is restricting extension header ordering anyway, leaving little reason for another sysctl knob - exthdrs_core.c is always built-in even when CONFIG_IPV6=n, where struct net has no .ipv6 member, so the read site would need an ifdef'd fallback to a constant anyway Therefore, just use a constant (IP6_MAX_EXT_HDRS_CNT). All four extension header walking functions are now bound by this limit. Note that the check in ip6_protocol_deliver_rcu() happens right before the goto resubmit, such that we don't have to have a test for ipv6_ext_hdr() in the fast-path. There's an ongoing IETF draft-iurman-6man-eh-occurrences to enforce IPv6 extension headers ordering and occurrence. The latter also discusses security implications. As per RFC8200 section 4.1, the occurrence rules for extension headers provide a practical upper bound which is 8. In order to be conservative, let's define IP6_MAX_EXT_HDRS_CNT as 12 to leave enough room for quirky setups. In the unlikely event that this is still not enough, then we might need to reconsider a sysctl. 
Signed-off-by: Daniel Borkmann Reviewed-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: Justin Iurman Link: https://patch.msgid.link/20260429154648.809751-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ include/net/ipv6.h | 3 +++ net/ipv6/exthdrs_core.c | 7 +++++++ net/ipv6/ip6_input.c | 5 +++++ net/ipv6/ip6_tunnel.c | 4 ++++ 5 files changed, 25 insertions(+) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e0ca3904ff8e..2f312d1f67d6 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -99,6 +99,7 @@ FN(FRAG_TOO_FAR) \ FN(TCP_MINTTL) \ FN(IPV6_BAD_EXTHDR) \ + FN(IPV6_TOO_MANY_EXTHDRS) \ FN(IPV6_NDISC_FRAG) \ FN(IPV6_NDISC_HOP_LIMIT) \ FN(IPV6_NDISC_BAD_CODE) \ @@ -494,6 +495,11 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_MINTTL, /** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */ SKB_DROP_REASON_IPV6_BAD_EXTHDR, + /** + * @SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS: Number of IPv6 extension + * headers in the packet exceeds IP6_MAX_EXT_HDRS_CNT. + */ + SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS, /** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */ SKB_DROP_REASON_IPV6_NDISC_FRAG, /** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. 
*/ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d042afe7a245..1dec81faff28 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -90,6 +90,9 @@ struct ip_tunnel_info; #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ +/* Hard limit on traversed IPv6 extension headers */ +#define IP6_MAX_EXT_HDRS_CNT 12 + /* * Addr type * diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 49e31e4ae7b7..9d06d487e8b1 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -73,6 +73,7 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; + int exthdr_cnt = 0; *frag_offp = 0; @@ -82,6 +83,8 @@ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, if (nexthdr == NEXTHDR_NONE) return -1; + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; @@ -190,6 +193,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, { unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int exthdr_cnt = 0; bool found; if (fragoff) @@ -216,6 +220,9 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, return -ENOENT; } + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + return -EBADMSG; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -EBADMSG; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 967b07aeb683..8972863c93ee 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -403,6 +403,7 @@ INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *)); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final) { + int exthdr_cnt = IP6CB(skb)->flags & IP6SKB_HOPBYHOP ? 
1 : 0; const struct inet6_protocol *ipprot; struct inet6_dev *idev; unsigned int nhoff; @@ -487,6 +488,10 @@ resubmit_final: nexthdr = ret; goto resubmit_final; } else { + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) { + SKB_DR_SET(reason, IPV6_TOO_MANY_EXTHDRS); + goto discard; + } goto resubmit; } } else if (ret == 0) { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index c468c83af0f2..9d1037ac082f 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -399,11 +399,15 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; + int exthdr_cnt = 0; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; + if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT)) + break; + if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; -- cgit v1.2.3 From 26ebd12e67bfc3543d77ce586c33ef29fcafab20 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 29 Apr 2026 16:19:30 +0800 Subject: net: enetc: fix VSI mailbox timeout handling and DMA lifecycle In the current VSI mailbox implementation, the VSI allocates a DMA buffer to store the message sent to the PSI. When the PSI receives the message request from the VSI, the hardware copies the message data from this DMA buffer to PSI's DMA buffer for processing. When enetc_msg_vsi_send() times out, two scenarios can occur: 1) Use-after-free: If the hardware hasn't completed message copying when the VSI frees the buffer, the hardware may subsequently copy the data from freed memory to PSI's DMA buffer. 2) Message race: If PSI hasn't processed the previous message when the next message is sent, the VSI may receive the previous message's reply, leading to incorrect handling. To address these issues, implement the following changes: - Check the mailbox busy status before sending a new message. 
If the mailbox is in busy state, it indicates the previous message is still being processed, so return an error immediately. - Add the 'msg' field to struct enetc_si to preserve the DMA buffer information. The caller of enetc_msg_vsi_send() no longer frees the DMA buffer. Instead, defer freeing until it is safe to do so (when mailbox is not busy on next send). - Add cleanup in enetc_vf_remove() to free the last message buffer. This ensures the DMA buffer remains valid during message copying and prevents message reply mismatches. Fixes: beb74ac878c8 ("enetc: Add vf to pf messaging support") Signed-off-by: Wei Fang Link: https://patch.msgid.link/20260429081930.3259824-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.h | 1 + drivers/net/ethernet/freescale/enetc/enetc_vf.c | 42 ++++++++++++++++++++----- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index e663bb5e614e..e691144e8756 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -330,6 +330,7 @@ struct enetc_si { struct workqueue_struct *workqueue; struct work_struct rx_mode_task; struct dentry *debugfs_root; + struct enetc_msg_swbd msg; /* Only valid for VSI */ }; #define ENETC_SI_ALIGN 32 diff --git a/drivers/net/ethernet/freescale/enetc/enetc_vf.c b/drivers/net/ethernet/freescale/enetc/enetc_vf.c index 6c4b374bcb0e..df8e95cc47d0 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_vf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_vf.c @@ -17,11 +17,36 @@ static void enetc_msg_vsi_write_msg(struct enetc_hw *hw, enetc_wr(hw, ENETC_VSIMSGSNDAR0, val); } +static void enetc_msg_dma_free(struct device *dev, struct enetc_msg_swbd *msg) +{ + if (msg->vaddr) { + dma_free_coherent(dev, msg->size, msg->vaddr, msg->dma); + msg->vaddr = NULL; + } +} + static int enetc_msg_vsi_send(struct enetc_si *si, 
struct enetc_msg_swbd *msg) { + struct device *dev = &si->pdev->dev; int timeout = 100; u32 vsimsgsr; + /* The VSI mailbox may be busy if last message was not yet processed + * by PSI. So need to check the mailbox status before sending. + */ + vsimsgsr = enetc_rd(&si->hw, ENETC_VSIMSGSR); + if (vsimsgsr & ENETC_VSIMSGSR_MB) { + /* It is safe to free the DMA buffer here, the caller does + * not access the DMA buffer if enetc_msg_vsi_send() fails. + */ + enetc_msg_dma_free(dev, msg); + dev_err(dev, "VSI mailbox is busy\n"); + return -EIO; + } + + /* Free the DMA buffer of the last message */ + enetc_msg_dma_free(dev, &si->msg); + si->msg = *msg; enetc_msg_vsi_write_msg(&si->hw, msg); do { @@ -32,12 +57,15 @@ static int enetc_msg_vsi_send(struct enetc_si *si, struct enetc_msg_swbd *msg) usleep_range(1000, 2000); } while (--timeout); - if (!timeout) + if (!timeout) { + dev_err(dev, "VSI mailbox timeout\n"); + return -ETIMEDOUT; + } /* check for message delivery error */ if (vsimsgsr & ENETC_VSIMSGSR_MS) { - dev_err(&si->pdev->dev, "VSI command execute error: %d\n", + dev_err(dev, "VSI command execute error: %d\n", ENETC_SIMSGSR_GET_MC(vsimsgsr)); return -EIO; } @@ -50,7 +78,6 @@ static int enetc_msg_vsi_set_primary_mac_addr(struct enetc_ndev_priv *priv, { struct enetc_msg_cmd_set_primary_mac *cmd; struct enetc_msg_swbd msg; - int err; msg.size = ALIGN(sizeof(struct enetc_msg_cmd_set_primary_mac), 64); msg.vaddr = dma_alloc_coherent(priv->dev, msg.size, &msg.dma, @@ -67,11 +94,7 @@ static int enetc_msg_vsi_set_primary_mac_addr(struct enetc_ndev_priv *priv, memcpy(&cmd->mac, saddr, sizeof(struct sockaddr)); /* send the command and wait */ - err = enetc_msg_vsi_send(priv->si, &msg); - - dma_free_coherent(priv->dev, msg.size, msg.vaddr, msg.dma); - - return err; + return enetc_msg_vsi_send(priv->si, &msg); } static int enetc_vf_set_mac_addr(struct net_device *ndev, void *addr) @@ -259,6 +282,7 @@ static void enetc_vf_remove(struct pci_dev *pdev) { struct enetc_si *si = 
pci_get_drvdata(pdev); struct enetc_ndev_priv *priv; + struct enetc_msg_swbd msg; priv = netdev_priv(si->ndev); unregister_netdev(si->ndev); @@ -270,7 +294,9 @@ static void enetc_vf_remove(struct pci_dev *pdev) free_netdev(si->ndev); + msg = si->msg; enetc_pci_remove(pdev); + enetc_msg_dma_free(&pdev->dev, &msg); } static const struct pci_device_id enetc_vf_id_table[] = { -- cgit v1.2.3 From 694de316f607fe2473d52ca0707e3918e72c1562 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 29 Apr 2026 16:37:42 +0800 Subject: net: libwx: fix VF illegal register access Register WX_CFG_PORT_ST is a PF restricted register. When a VF is initialized, attempting to read this register triggers an illegal register access, which leads to a system hang. When the device is VF, the bus function ID can be obtained directly from the PCI_FUNC(pdev->devfn). Fixes: a04ea57aae37 ("net: libwx: fix device bus LAN ID") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/4D1F4452D21DE107+20260429083743.88961-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index d3772d01e00b..2451f6b20b11 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -2480,8 +2480,11 @@ int wx_sw_init(struct wx *wx) wx->oem_svid = pdev->subsystem_vendor; wx->oem_ssid = pdev->subsystem_device; wx->bus.device = PCI_SLOT(pdev->devfn); - wx->bus.func = FIELD_GET(WX_CFG_PORT_ST_LANID, - rd32(wx, WX_CFG_PORT_ST)); + if (pdev->is_virtfn) + wx->bus.func = PCI_FUNC(pdev->devfn); + else + wx->bus.func = FIELD_GET(WX_CFG_PORT_ST_LANID, + rd32(wx, WX_CFG_PORT_ST)); if (wx->oem_svid == PCI_VENDOR_ID_WANGXUN || pdev->is_virtfn) { -- cgit v1.2.3 From 7a33345153eeeda195c55f15be27074e4c3b5109 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 
29 Apr 2026 16:37:43 +0800 Subject: net: libwx: use request_irq for VF misc interrupt Currently, request_threaded_irq() is used with a primary handler but a NULL threaded handler, while also setting the IRQF_ONESHOT flag. This specific combination triggers a WARNING since the commit aef30c8d569c ("genirq: Warn about using IRQF_ONESHOT without a threaded handler"). WARNING: kernel/irq/manage.c:1502 at __setup_irq+0x4fa/0x760 Fix the issue by switching to request_irq(), which is the appropriate interface for a non-threaded interrupt handler, and removing the unnecessary IRQF_ONESHOT flag. Fixes: eb4898fde1de ("net: libwx: add wangxun vf common api") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/786DDC7D5CCA6D0A+20260429083743.88961-2-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_vf_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c b/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c index 29cdbed2e5ec..94ff8f5f0b4c 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_vf_common.c @@ -99,8 +99,8 @@ int wx_request_msix_irqs_vf(struct wx *wx) } } - err = request_threaded_irq(wx->msix_entry->vector, wx_msix_misc_vf, - NULL, IRQF_ONESHOT, netdev->name, wx); + err = request_irq(wx->msix_entry->vector, wx_msix_misc_vf, + 0, netdev->name, wx); if (err) { wx_err(wx, "request_irq for msix_other failed: %d\n", err); goto free_queue_irqs; -- cgit v1.2.3 From 75df490c9e8457990c8b227650f6491218ce018b Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 29 Apr 2026 14:02:31 +0200 Subject: net: airoha: Move entries to queue head in case of DMA mapping failure in airoha_dev_xmit() In order to respect the original descriptor order and avoid any potential IOMMU fault or memory corruption, move pending queue entries to the head of hw queue tx_list if the DMA mapping 
of current inflight packet fails in airoha_dev_xmit routine. Fixes: 3f47e67dff1f7 ("net: airoha: Add the capability to consume out-of-order DMA tx descriptors") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260429-airoha-xmit-unmap-error-path-v2-1-32e43b7c6d25@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index f8b3d53bccad..d0c0c0ec8a80 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2120,14 +2120,12 @@ static netdev_tx_t airoha_dev_xmit(struct sk_buff *skb, return NETDEV_TX_OK; error_unmap: - while (!list_empty(&tx_list)) { - e = list_first_entry(&tx_list, struct airoha_queue_entry, - list); + list_for_each_entry(e, &tx_list, list) { dma_unmap_single(dev->dev.parent, e->dma_addr, e->dma_len, DMA_TO_DEVICE); e->dma_addr = 0; - list_move_tail(&e->list, &q->tx_list); } + list_splice(&tx_list, &q->tx_list); spin_unlock_bh(&q->lock); error: -- cgit v1.2.3 From aaadccde312f1f6c752461e015adcaa25d463cbc Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:13 +0530 Subject: octeontx2-af: npc: cn20k: Propagate MCAM key-type errors on cn20k npc_mcam_idx_2_key_type() can fail; callers used to ignore it and still used kw_type when enabling, configuring, copying, and reading MCAM entries. That could program or decode hardware with an undefined key type. Return -EINVAL when key-type lookup fails. Return -EINVAL from npc_cn20k_copy_mcam_entry() when src and dest key types differ instead of failing silently. Change npc_cn20k_{enable,config,copy,read}_mcam_entry() to return int on success or error. 
Thread those errors through the cn20k MCAM write and read mbox handlers, the cn20k baseline steer read path, NPC defrag move (disable/copy/enable with dev_err and -EFAULT), and the DMAC update path in rvu_npc_fs.c. Make npc_copy_mcam_entry() return int so the cn20k branch can return npc_cn20k_copy_mcam_entry() without a void/int mismatch, and fail NPC_MCAM_SHIFT_ENTRY when copy fails. Cc: Suman Ghosh Cc: Dan Carpenter Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Link: https://lore.kernel.org/netdev/adiQJvuKlEhq2ILx@stanley.mountain/ Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-2-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 122 +++++++++++++++------ .../net/ethernet/marvell/octeontx2/af/cn20k/npc.h | 20 ++-- .../net/ethernet/marvell/octeontx2/af/rvu_npc.c | 18 ++- .../net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 20 ++-- 4 files changed, 124 insertions(+), 56 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 7291fdb89b03..7170dcf26200 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -798,7 +798,7 @@ program_mkex_extr: iounmap(mkex_prfl_addr); } -void +int npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, int index, bool enable) { @@ -808,7 +808,9 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, u64 cfg, hw_prio; u8 kw_type; - npc_mcam_idx_2_key_type(rvu, index, &kw_type); + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) + return -EINVAL; + if (kw_type == NPC_MCAM_KEY_X2) { cfg = rvu_read64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, @@ -819,7 +821,7 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, rvu_write64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), cfg); - return; + return 0; } /* For 
NPC_CN20K_MCAM_KEY_X4 keys, both the banks @@ -836,6 +838,8 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), cfg); } + + return 0; } void @@ -1042,9 +1046,9 @@ npc_cn20k_set_mcam_bank_cfg(struct rvu *rvu, int blkaddr, int mcam_idx, } } -void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, - u8 intf, struct cn20k_mcam_entry *entry, - bool enable, u8 hw_prio, u8 req_kw_type) +int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, + u8 intf, struct cn20k_mcam_entry *entry, + bool enable, u8 hw_prio, u8 req_kw_type) { struct npc_mcam *mcam = &rvu->hw->mcam; int mcam_idx = index % mcam->banksize; @@ -1052,10 +1056,13 @@ void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, int kw = 0; u8 kw_type; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) + return -EINVAL; + /* Disable before mcam entry update */ - npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, false); + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, false)) + return -EINVAL; - npc_mcam_idx_2_key_type(rvu, index, &kw_type); /* CAM1 takes the comparison value and * CAM0 specifies match for a bit in key being '0' or '1' or 'dontcare'. 
* CAM1 = 0 & CAM0 = 1 => match if key = 0 @@ -1120,9 +1127,11 @@ set_cfg: /* PF installing VF rule */ npc_cn20k_set_mcam_bank_cfg(rvu, blkaddr, mcam_idx, bank, kw_type, enable, hw_prio); + + return 0; } -void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) +int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) { struct npc_mcam *mcam = &rvu->hw->mcam; u64 cfg, sreg, dreg, soff, doff; @@ -1132,10 +1141,15 @@ void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) dbank = npc_get_bank(mcam, dest); sbank = npc_get_bank(mcam, src); - npc_mcam_idx_2_key_type(rvu, src, &src_kwtype); - npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype); + + if (npc_mcam_idx_2_key_type(rvu, src, &src_kwtype)) + return -EINVAL; + + if (npc_mcam_idx_2_key_type(rvu, dest, &dest_kwtype)) + return -EINVAL; + if (src_kwtype != dest_kwtype) - return; + return -EINVAL; src &= (mcam->banksize - 1); dest &= (mcam->banksize - 1); @@ -1170,6 +1184,8 @@ void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) if (src_kwtype == NPC_MCAM_KEY_X2) break; } + + return 0; } static void npc_cn20k_fill_entryword(struct cn20k_mcam_entry *entry, int idx, @@ -1179,16 +1195,17 @@ static void npc_cn20k_fill_entryword(struct cn20k_mcam_entry *entry, int idx, entry->kw_mask[idx] = cam1 ^ cam0; } -void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, - struct cn20k_mcam_entry *entry, - u8 *intf, u8 *ena, u8 *hw_prio) +int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, + struct cn20k_mcam_entry *entry, + u8 *intf, u8 *ena, u8 *hw_prio) { struct npc_mcam *mcam = &rvu->hw->mcam; u64 cam0, cam1, bank_cfg, cfg; int kw = 0, bank; u8 kw_type; - npc_mcam_idx_2_key_type(rvu, index, &kw_type); + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) + return -EINVAL; bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); @@ -1298,6 +1315,8 @@ read_action: cfg = rvu_read64(rvu, blkaddr, 
NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, 0, 1)); entry->vtag_action = cfg; + + return 0; } int rvu_mbox_handler_npc_cn20k_mcam_write_entry(struct rvu *rvu, @@ -1335,11 +1354,10 @@ int rvu_mbox_handler_npc_cn20k_mcam_write_entry(struct rvu *rvu, if (is_pffunc_af(req->hdr.pcifunc)) nix_intf = req->intf; - npc_cn20k_config_mcam_entry(rvu, blkaddr, req->entry, nix_intf, - &req->entry_data, req->enable_entry, - req->hw_prio, req->req_kw_type); + rc = npc_cn20k_config_mcam_entry(rvu, blkaddr, req->entry, nix_intf, + &req->entry_data, req->enable_entry, + req->hw_prio, req->req_kw_type); - rc = 0; exit: mutex_unlock(&mcam->lock); return rc; @@ -1361,11 +1379,13 @@ int rvu_mbox_handler_npc_cn20k_mcam_read_entry(struct rvu *rvu, mutex_lock(&mcam->lock); rc = npc_mcam_verify_entry(mcam, pcifunc, req->entry); - if (!rc) - npc_cn20k_read_mcam_entry(rvu, blkaddr, req->entry, - &rsp->entry_data, &rsp->intf, - &rsp->enable, &rsp->hw_prio); + if (rc) + goto fail; + rc = npc_cn20k_read_mcam_entry(rvu, blkaddr, req->entry, + &rsp->entry_data, &rsp->intf, + &rsp->enable, &rsp->hw_prio); +fail: mutex_unlock(&mcam->lock); return rc; } @@ -1375,11 +1395,13 @@ int rvu_mbox_handler_npc_cn20k_mcam_alloc_and_write_entry(struct rvu *rvu, struct npc_mcam_alloc_and_write_entry_rsp *rsp) { struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, req->hdr.pcifunc); + struct npc_mcam_free_entry_req free_req = { 0 }; struct npc_mcam_alloc_entry_req entry_req; struct npc_mcam_alloc_entry_rsp entry_rsp; struct npc_mcam *mcam = &rvu->hw->mcam; u16 entry = NPC_MCAM_ENTRY_INVALID; - int blkaddr, rc; + struct msg_rsp free_rsp; + int blkaddr, rc, err; u8 nix_intf; blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); @@ -1415,12 +1437,23 @@ int rvu_mbox_handler_npc_cn20k_mcam_alloc_and_write_entry(struct rvu *rvu, else nix_intf = pfvf->nix_rx_intf; - npc_cn20k_config_mcam_entry(rvu, blkaddr, entry, nix_intf, - &req->entry_data, req->enable_entry, - req->hw_prio, req->req_kw_type); + rc = npc_cn20k_config_mcam_entry(rvu, 
blkaddr, entry, nix_intf, + &req->entry_data, req->enable_entry, + req->hw_prio, req->req_kw_type); mutex_unlock(&mcam->lock); + if (rc) { + free_req.hdr.pcifunc = req->hdr.pcifunc; + free_req.entry = entry_rsp.entry; + err = rvu_mbox_handler_npc_mcam_free_entry(rvu, &free_req, &free_rsp); + if (err) + dev_err(rvu->dev, + "%s: Error to free mcam idx %u\n", + __func__, entry_rsp.entry); + return rc; + } + rsp->entry = entry_rsp.entry; return 0; } @@ -1480,9 +1513,9 @@ int rvu_mbox_handler_npc_cn20k_read_base_steer_rule(struct rvu *rvu, read_entry: /* Read the mcam entry */ - npc_cn20k_read_mcam_entry(rvu, blkaddr, index, - &rsp->entry, &intf, - &enable, &hw_prio); + rc = npc_cn20k_read_mcam_entry(rvu, blkaddr, index, + &rsp->entry, &intf, + &enable, &hw_prio); mutex_unlock(&mcam->lock); out: return rc; @@ -3607,9 +3640,30 @@ int npc_defrag_move_vdx_to_free(struct rvu *rvu, NPC_AF_CN20K_MCAMEX_BANKX_STAT_EXT(midx, bank)); - npc_cn20k_enable_mcam_entry(rvu, blkaddr, old_midx, false); - npc_cn20k_copy_mcam_entry(rvu, blkaddr, old_midx, new_midx); - npc_cn20k_enable_mcam_entry(rvu, blkaddr, new_midx, true); + /* If bug happened during copy/enable mcam, then there is a bug in allocation + * algorithm itself. There is no point in rewinding and returning, as it + * will face further issue. 
Return error after printing error + */ + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, old_midx, false)) { + dev_err(rvu->dev, + "%s: Error happened while disabling old_mid=%u\n", + __func__, old_midx); + return -EFAULT; + } + + if (npc_cn20k_copy_mcam_entry(rvu, blkaddr, old_midx, new_midx)) { + dev_err(rvu->dev, + "%s: Error happened while copying old_midx=%u new_midx=%u\n", + __func__, old_midx, new_midx); + return -EFAULT; + } + + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, new_midx, true)) { + dev_err(rvu->dev, + "%s: Error happened while enabling new_mid=%u\n", + __func__, new_midx); + return -EFAULT; + } midx = new_midx % mcam->banksize; bank = new_midx / mcam->banksize; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h index 815d0b257a7e..8f3eea9cfb1d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h @@ -320,16 +320,16 @@ void npc_cn20k_dft_rules_free(struct rvu *rvu, u16 pcifunc); int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, u16 *mcast, u16 *promisc, u16 *ucast); -void npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, - u8 intf, struct cn20k_mcam_entry *entry, - bool enable, u8 hw_prio, u8 req_kw_type); -void npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, - int index, bool enable); -void npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, - u16 src, u16 dest); -void npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, - struct cn20k_mcam_entry *entry, u8 *intf, - u8 *ena, u8 *hw_prio); +int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, + u8 intf, struct cn20k_mcam_entry *entry, + bool enable, u8 hw_prio, u8 req_kw_type); +int npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, + int index, bool enable); +int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, + u16 src, u16 dest); +int 
npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, + struct cn20k_mcam_entry *entry, u8 *intf, + u8 *ena, u8 *hw_prio); void npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int bank, int index); int npc_mcam_idx_2_key_type(struct rvu *rvu, u16 mcam_idx, u8 *key_type); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index c2ca5ed1d028..ecaf0946b852 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -241,7 +241,10 @@ void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, if (index < 0 || index >= mcam->banksize * mcam->banks) return; - return npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable); + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable)) + dev_err(rvu->dev, "Error to %s mcam %u entry\n", + enable ? "enable" : "disable", index); + return; } index &= (mcam->banksize - 1); @@ -589,8 +592,8 @@ void npc_read_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, NPC_AF_MCAMEX_BANKX_CFG(src, sbank)) & 1; } -static void npc_copy_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, - int blkaddr, u16 src, u16 dest) +static int npc_copy_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, + int blkaddr, u16 src, u16 dest) { int dbank = npc_get_bank(mcam, dest); int sbank = npc_get_bank(mcam, src); @@ -630,6 +633,7 @@ static void npc_copy_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, NPC_AF_MCAMEX_BANKX_CFG(src, sbank)); rvu_write64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_CFG(dest, dbank), cfg); + return 0; } u64 npc_get_mcam_action(struct rvu *rvu, struct npc_mcam *mcam, @@ -3266,7 +3270,10 @@ int rvu_mbox_handler_npc_mcam_shift_entry(struct rvu *rvu, npc_enable_mcam_entry(rvu, mcam, blkaddr, new_entry, false); /* Copy rule from old entry to new entry */ - npc_copy_mcam_entry(rvu, mcam, blkaddr, old_entry, new_entry); + if (npc_copy_mcam_entry(rvu, mcam, blkaddr, old_entry, 
new_entry)) { + rc = NPC_MCAM_INVALID_REQ; + break; + } /* Copy counter mapping, if any */ cntr = mcam->entry2cntr_map[old_entry]; @@ -3284,7 +3291,8 @@ int rvu_mbox_handler_npc_mcam_shift_entry(struct rvu *rvu, /* If shift has failed then report the failed index */ if (index != req->shift_count) { - rc = NPC_MCAM_PERM_DENIED; + if (!rc) + rc = NPC_MCAM_PERM_DENIED; rsp->failed_entry_idx = index; } diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index b45798d9fdab..fe10554b1f0e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -1980,13 +1980,15 @@ static int npc_update_dmac_value(struct rvu *rvu, int npcblkaddr, ether_addr_copy(rule->packet.dmac, pfvf->mac_addr); - if (is_cn20k(rvu->pdev)) - npc_cn20k_read_mcam_entry(rvu, npcblkaddr, rule->entry, - cn20k_entry, &intf, - &enable, &hw_prio); - else + if (is_cn20k(rvu->pdev)) { + if (npc_cn20k_read_mcam_entry(rvu, npcblkaddr, rule->entry, + cn20k_entry, &intf, + &enable, &hw_prio)) + return -EINVAL; + } else { npc_read_mcam_entry(rvu, mcam, npcblkaddr, rule->entry, entry, &intf, &enable); + } npc_update_entry(rvu, NPC_DMAC, &mdata, ether_addr_to_u64(pfvf->mac_addr), 0, @@ -2038,8 +2040,12 @@ void npc_mcam_enable_flows(struct rvu *rvu, u16 target) continue; } - if (rule->vfvlan_cfg) - npc_update_dmac_value(rvu, blkaddr, rule, pfvf); + if (rule->vfvlan_cfg) { + if (npc_update_dmac_value(rvu, blkaddr, rule, pfvf)) + dev_err(rvu->dev, + "Update dmac failed for %u, target=%#x\n", + rule->entry, target); + } if (rule->rx_action.op == NIX_RX_ACTION_DEFAULT) { if (!def_ucast_rule) -- cgit v1.2.3 From 1100af13fd14b523f1b0634c14be497b41c78958 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:14 +0530 Subject: octeontx2-af: npc: cn20k: Drop debugfs_create_file() error checks in init debugfs is not intended to be checked for allocation failures 
the way other kernel APIs are: callers should not fail probe or subsystem init because a debugfs node could not be created, including when debugfs is disabled in Kconfig. Replacing NULL checks with IS_ERR() checks is similarly wrong for optional debugfs. Remove dentry checks and -EFAULT returns from npc_cn20k_debugfs_init(). See: https://staticthinking.wordpress.com/2023/07/24/ debugfs-functions-are-not-supposed-to-be-checked/ Cc: Dan Carpenter Fixes: 528530dff56b ("octeontx2-af: npc: cn20k: add debugfs support") Link: https://lore.kernel.org/netdev/adjNGPWKMOk3KgWL@stanley.mountain/ Reviewed-by: Simon Horman Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-3-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/af/cn20k/debugfs.c | 33 +++++++--------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c index 3debf2fae1a4..6f13296303cb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/debugfs.c @@ -249,34 +249,21 @@ DEFINE_SHOW_ATTRIBUTE(npc_defrag); int npc_cn20k_debugfs_init(struct rvu *rvu) { struct npc_priv_t *npc_priv = npc_priv_get(); - struct dentry *npc_dentry; - npc_dentry = debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_mcam_layout_fops); + debugfs_create_file("mcam_layout", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_mcam_layout_fops); - if (!npc_dentry) - return -EFAULT; + debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc, + rvu, &npc_mcam_default_fops); - npc_dentry = debugfs_create_file("mcam_default", 0444, rvu->rvu_dbg.npc, - rvu, &npc_mcam_default_fops); + debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_vidx2idx_map_fops); - if (!npc_dentry) - return -EFAULT; + debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc, 
+ npc_priv, &npc_idx2vidx_map_fops); - npc_dentry = debugfs_create_file("vidx2idx", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_vidx2idx_map_fops); - if (!npc_dentry) - return -EFAULT; - - npc_dentry = debugfs_create_file("idx2vidx", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_idx2vidx_map_fops); - if (!npc_dentry) - return -EFAULT; - - npc_dentry = debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc, - npc_priv, &npc_defrag_fops); - if (!npc_dentry) - return -EFAULT; + debugfs_create_file("defrag", 0444, rvu->rvu_dbg.npc, + npc_priv, &npc_defrag_fops); return 0; } -- cgit v1.2.3 From adb5ff41efbc0a9d86fabf880076973379db6e49 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:15 +0530 Subject: octeontx2-af: npc: cn20k: Propagate errors in defrag MCAM alloc rollback npc_defrag_alloc_free_slots() allocates MCAM indexes in up to two passes on bank0 then bank1. On failure it rolls back by freeing entries already placed in save[]. __npc_subbank_alloc() can return a negative errno while only part of the indexes are valid. The rollback loop used rc for npc_mcam_idx_2_subbank_idx() as well, so a successful lookup stored zero in rc and a later __npc_subbank_free() failure could still end with return 0 when the allocation path had also left rc at zero (for example shortfall after zero return values from the alloc helpers). Jump to the rollback path immediately when either __npc_subbank_alloc() call fails, preserving its errno. If both calls succeed but the total allocated count is still less than cnt, set rc to -ENOSPC before rollback. Use a separate err variable for npc_mcam_idx_2_subbank_idx() so a successful lookup no longer clears a non-zero rc from the allocation phase. 
Cc: Dan Carpenter Fixes: 645c6e3c1999 ("octeontx2-af: npc: cn20k: virtual index support") Link: https://lore.kernel.org/netdev/adjNJEpILRZATB2N@stanley.mountain/ Reviewed-by: Simon Horman Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-4-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 7170dcf26200..87da43088b67 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -2338,6 +2338,7 @@ err2: __npc_subbank_mark_free(rvu, sb); err1: kfree(save); + *alloc_cnt = 0; return rc; } @@ -3515,7 +3516,7 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu, { int alloc_cnt1, alloc_cnt2; struct npc_subbank *sb; - int rc, sb_off, i; + int rc, sb_off, i, err; bool deleted; sb = &npc_priv.sb[f->idx]; @@ -3529,6 +3530,7 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu, NPC_MCAM_LOWER_PRIO, false, cnt, save, cnt, true, &alloc_cnt1); + if (alloc_cnt1 < cnt) { rc = __npc_subbank_alloc(rvu, sb, NPC_MCAM_KEY_X2, sb->b1b, @@ -3544,15 +3546,17 @@ static int npc_defrag_alloc_free_slots(struct rvu *rvu, dev_err(rvu->dev, "%s: Failed to alloc cnt=%u alloc_cnt1=%u alloc_cnt2=%u\n", __func__, cnt, alloc_cnt1, alloc_cnt2); + rc = -ENOSPC; goto fail_free_alloc; } + return 0; fail_free_alloc: for (i = 0; i < alloc_cnt1 + alloc_cnt2; i++) { - rc = npc_mcam_idx_2_subbank_idx(rvu, save[i], - &sb, &sb_off); - if (rc) { + err = npc_mcam_idx_2_subbank_idx(rvu, save[i], + &sb, &sb_off); + if (err) { dev_err(rvu->dev, "%s: Error to find subbank for mcam idx=%u\n", __func__, save[i]); -- cgit v1.2.3 From d7e5940c4c508df73b15d9bc29628a83b3674fff Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:16 +0530 
Subject: octeontx2-af: npc: cn20k: Fix target map and rule npc_defrag_move_vdx_to_free() disables, copies, and enables the MCAM entry at a new index but previously left entry2target_pffunc[] and the mcam_rules list still keyed to the old index. Copy the target PF association to the new slot, clear the old one, and retarget the rule entry so software state matches the relocated hardware context. Fixes: 645c6e3c1999 ("octeontx2-af: npc: cn20k: virtual index support") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-5-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 87da43088b67..70ce3f49adc1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -3602,9 +3602,10 @@ int npc_defrag_move_vdx_to_free(struct rvu *rvu, struct npc_defrag_node *v, int cnt, u16 *save) { + u16 new_midx, old_midx, vidx, target_pf; struct npc_mcam *mcam = &rvu->hw->mcam; + struct rvu_npc_mcam_rule *rule, *tmp; int i, vidx_cnt, rc, sb_off; - u16 new_midx, old_midx, vidx; struct npc_subbank *sb; bool deleted; u16 pcifunc; @@ -3723,8 +3724,21 @@ int npc_defrag_move_vdx_to_free(struct rvu *rvu, mcam->entry2pfvf_map[new_midx] = pcifunc; /* Counter is not preserved */ mcam->entry2cntr_map[new_midx] = new_midx; + target_pf = mcam->entry2target_pffunc[old_midx]; + mcam->entry2target_pffunc[new_midx] = target_pf; + mcam->entry2target_pffunc[old_midx] = NPC_MCAM_INVALID_MAP; + npc_mcam_set_bit(mcam, new_midx); + /* Note: list order is not functionally required for mcam_rules */ + list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) { + if (rule->entry != old_midx) + continue; + + rule->entry = new_midx; + break; + } + /* Mark as 
invalid */ v->vidx[vidx_cnt - i - 1] = -1; save[cnt - i - 1] = -1; -- cgit v1.2.3 From d2dabf09632c84b7acdc0fb2eeb6b6fe9c0f9106 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:17 +0530 Subject: octeontx2-af: npc: cn20k: Clear MCAM entries by index and key width Replace the old four-argument CN20K MCAM clear with a per-bank static helper and npc_cn20k_clear_mcam_entry() that takes a logical MCAM index, resolves the key width via npc_mcam_idx_2_key_type(), and clears either one bank (X2) or every bank (X4). Call it from npc_clear_mcam_entry() on cn20k and log when key-type lookup fails. Use the per-bank helper from npc_cn20k_config_mcam_entry() for pre-program clears. For loopback VFs, use the promisc MCAM index as ucast_idx when copying RSS action for promisc, matching cn20k default-rule layout. Cc: Suman Ghosh Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-6-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 37 +++++++++++++++++++--- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.h | 3 +- .../net/ethernet/marvell/octeontx2/af/rvu_npc.c | 17 ++++++++-- 3 files changed, 48 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 70ce3f49adc1..112c37c190b1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -842,8 +842,8 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, return 0; } -void -npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int bank, int index) +static void +npc_clear_x2_entry(struct rvu *rvu, int blkaddr, int bank, int index) { rvu_write64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CAMX_INTF_EXT(index, bank, 1), @@ -877,6 +877,33 @@ npc_cn20k_clear_mcam_entry(struct rvu 
*rvu, int blkaddr, int bank, int index) NPC_AF_CN20K_MCAMEX_BANKX_STAT_EXT(index, bank), 0); } +int +npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int mcam_idx) +{ + struct npc_mcam *mcam = &rvu->hw->mcam; + int bank = npc_get_bank(mcam, mcam_idx); + u8 kw_type; + int index; + + if (npc_mcam_idx_2_key_type(rvu, mcam_idx, &kw_type)) + return -EINVAL; + + index = mcam_idx & (mcam->banksize - 1); + + if (kw_type == NPC_MCAM_KEY_X2) { + npc_clear_x2_entry(rvu, blkaddr, bank, index); + return 0; + } + + /* For NPC_MCAM_KEY_X4 keys, both the banks + * need to be programmed with the same value. + */ + for (bank = 0; bank < mcam->banks_per_entry; bank++) + npc_clear_x2_entry(rvu, blkaddr, bank, index); + + return 0; +} + static void npc_cn20k_get_keyword(struct cn20k_mcam_entry *entry, int idx, u64 *cam0, u64 *cam1) { @@ -1071,7 +1098,7 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, */ if (kw_type == NPC_MCAM_KEY_X2) { /* Clear mcam entry to avoid writes being suppressed by NPC */ - npc_cn20k_clear_mcam_entry(rvu, blkaddr, bank, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, bank, mcam_idx); npc_cn20k_config_kw_x2(rvu, mcam, blkaddr, mcam_idx, intf, entry, bank, kw_type, kw, req_kw_type); @@ -1096,8 +1123,8 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, } /* Clear mcam entry to avoid writes being suppressed by NPC */ - npc_cn20k_clear_mcam_entry(rvu, blkaddr, 0, mcam_idx); - npc_cn20k_clear_mcam_entry(rvu, blkaddr, 1, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, 0, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, 1, mcam_idx); npc_cn20k_config_kw_x4(rvu, mcam, blkaddr, mcam_idx, intf, entry, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h index 8f3eea9cfb1d..2f761b97f91b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h @@ -330,8 +330,7 @@ int 
npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, struct cn20k_mcam_entry *entry, u8 *intf, u8 *ena, u8 *hw_prio); -void npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, - int bank, int index); +int npc_cn20k_clear_mcam_entry(struct rvu *rvu, int blkaddr, int index); int npc_mcam_idx_2_key_type(struct rvu *rvu, u16 mcam_idx, u8 *key_type); u16 npc_cn20k_vidx2idx(u16 index); u16 npc_cn20k_idx2vidx(u16 idx); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index ecaf0946b852..44ca65efc80f 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -261,6 +261,13 @@ static void npc_clear_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, int bank = npc_get_bank(mcam, index); int actbank = bank; + if (is_cn20k(rvu->pdev)) { + if (npc_cn20k_clear_mcam_entry(rvu, blkaddr, index)) + dev_err(rvu->dev, "%s Failed to clear mcam %u\n", + __func__, index); + return; + } + index &= (mcam->banksize - 1); for (; bank < (actbank + mcam->banks_per_entry); bank++) { rvu_write64(rvu, blkaddr, @@ -755,9 +762,15 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, /* If the corresponding PF's ucast action is RSS, * use the same action for promisc also + * Please note that for lbk(s) "index" and "ucast_idx" + * will be same. 
*/ - ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, - nixlf, NIXLF_UCAST_ENTRY); + if (is_lbk_vf(rvu, pcifunc)) + ucast_idx = index; + else + ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, + nixlf, NIXLF_UCAST_ENTRY); + if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx)) *(u64 *)&action = npc_get_mcam_action(rvu, mcam, blkaddr, ucast_idx); -- cgit v1.2.3 From 2b6d6bb7282c34dd8c04ee782393231acf5a26e2 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:18 +0530 Subject: octeontx2-af: npc: cn20k: Fix bank value For X4 keys, the old helper's loop reused the bank parameter as the loop counter, so bank no longer reflected the caller's bank after the loop and the control flow was hard to follow. Program NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT directly in npc_cn20k_config_mcam_entry(): one CFG write for X2 using the computed bank, and one CFG write per bank inside the X4 action loop. Enable the entry at the end with npc_cn20k_enable_mcam_entry(..., true) instead of embedding the enable bit in bank_cfg via the removed helper.
Cc: Suman Ghosh Fixes: 4e527f1e5c15 ("octeontx2-af: npc: cn20k: Add new mailboxes for CN20K silicon") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-7-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 92 +++++++++------------- 1 file changed, 37 insertions(+), 55 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 112c37c190b1..4773277fd409 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -1045,34 +1045,6 @@ static void npc_cn20k_config_kw_x4(struct rvu *rvu, struct npc_mcam *mcam, kw, req_kw_type); } -static void -npc_cn20k_set_mcam_bank_cfg(struct rvu *rvu, int blkaddr, int mcam_idx, - int bank, u8 kw_type, bool enable, u8 hw_prio) -{ - struct npc_mcam *mcam = &rvu->hw->mcam; - u64 bank_cfg; - - bank_cfg = (u64)hw_prio << 24; - if (enable) - bank_cfg |= 0x1; - - if (kw_type == NPC_MCAM_KEY_X2) { - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), - bank_cfg); - return; - } - - /* For NPC_MCAM_KEY_X4 keys, both the banks - * need to be programmed with the same value. 
- */ - for (bank = 0; bank < mcam->banks_per_entry; bank++) { - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), - bank_cfg); - } -} - int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, u8 intf, struct cn20k_mcam_entry *entry, bool enable, u8 hw_prio, u8 req_kw_type) @@ -1080,6 +1052,7 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, struct npc_mcam *mcam = &rvu->hw->mcam; int mcam_idx = index % mcam->banksize; int bank = index / mcam->banksize; + u64 bank_cfg = (u64)hw_prio << 24; int kw = 0; u8 kw_type; @@ -1119,41 +1092,50 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, bank, 1), entry->vtag_action); - goto set_cfg; - } - /* Clear mcam entry to avoid writes being suppressed by NPC */ - npc_clear_x2_entry(rvu, blkaddr, 0, mcam_idx); - npc_clear_x2_entry(rvu, blkaddr, 1, mcam_idx); - - npc_cn20k_config_kw_x4(rvu, mcam, blkaddr, - mcam_idx, intf, entry, - kw_type, req_kw_type); - for (bank = 0; bank < mcam->banks_per_entry; bank++) { - /* Set 'action' */ + /* Set HW priority */ rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, - bank, 0), - entry->action); + NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), + bank_cfg); - /* Set TAG 'action' */ - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, - bank, 1), - entry->vtag_action); + } else { + /* Clear mcam entry to avoid writes being suppressed by NPC */ + npc_clear_x2_entry(rvu, blkaddr, 0, mcam_idx); + npc_clear_x2_entry(rvu, blkaddr, 1, mcam_idx); - /* Set 'action2' for inline receive */ - rvu_write64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, - bank, 2), - entry->action2); + npc_cn20k_config_kw_x4(rvu, mcam, blkaddr, + mcam_idx, intf, entry, + kw_type, req_kw_type); + for (bank = 0; bank < mcam->banks_per_entry; bank++) { + /* Set 'action' */ + rvu_write64(rvu, blkaddr, + 
NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, + bank, 0), + entry->action); + + /* Set TAG 'action' */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, + bank, 1), + entry->vtag_action); + + /* Set 'action2' for inline receive */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(mcam_idx, + bank, 2), + entry->action2); + + /* Set HW priority */ + rvu_write64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_CFG_EXT(mcam_idx, bank), + bank_cfg); + } } -set_cfg: /* TODO: */ /* PF installing VF rule */ - npc_cn20k_set_mcam_bank_cfg(rvu, blkaddr, mcam_idx, bank, - kw_type, enable, hw_prio); + if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable)) + return -EINVAL; return 0; } -- cgit v1.2.3 From f6803eb070bfb9a5114d16ae15053106bc7842ae Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:19 +0530 Subject: octeontx2-af: npc: cn20k: Fix MCAM actions read npc_cn20k_read_mcam_entry() always reloaded action and vtag_action from bank 0 after programming the CAM words. Use the bank returned by npc_get_bank() for the ACTION reads as well, and read those registers once up front so both X2 and X4 paths share the same metadata. Return directly from the X2 keyword path now that the action fields are already populated. 
Cc: Suman Ghosh Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-8-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 4773277fd409..bb0a9ac7aab3 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -1219,6 +1219,18 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); + cfg = rvu_read64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, bank, 0)); + entry->action = cfg; + + cfg = rvu_read64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, bank, 1)); + entry->vtag_action = cfg; + + cfg = rvu_read64(rvu, blkaddr, + NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, bank, 2)); + entry->action2 = cfg; + cfg = rvu_read64(rvu, blkaddr, NPC_AF_CN20K_MCAMEX_BANKX_CAMX_INTF_EXT(index, bank, 1)) & 3; @@ -1268,7 +1280,7 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, bank, 0)); npc_cn20k_fill_entryword(entry, kw + 3, cam0, cam1); - goto read_action; + return 0; } for (bank = 0; bank < mcam->banks_per_entry; bank++, kw = kw + 4) { @@ -1313,18 +1325,6 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, npc_cn20k_fill_entryword(entry, kw + 3, cam0, cam1); } -read_action: - /* 'action' is set to same value for both bank '0' and '1'. - * Hence, reading bank '0' should be enough. 
- */ - cfg = rvu_read64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, 0, 0)); - entry->action = cfg; - - cfg = rvu_read64(rvu, blkaddr, - NPC_AF_CN20K_MCAMEX_BANKX_ACTIONX_EXT(index, 0, 1)); - entry->vtag_action = cfg; - return 0; } -- cgit v1.2.3 From afb474bd4ffc314de766afc295ac64b42856f48e Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:20 +0530 Subject: octeontx2-af: npc: cn20k: Initialize default-rule index outputs up front npc_cn20k_dft_rules_idx_get() wrote USHRT_MAX into individual outputs only on some error paths (lbk promisc lookup, VF ucast lookup, and the PF rule walk), which could leave other caller slots stale across retries. Set every non-NULL bcast/mcast/promisc/ucast pointer to USHRT_MAX once at entry, then drop the duplicate assignments on failure. Successful lookups still overwrite the relevant slot before returning. Fixes: 09d3b7a1403f ("octeontx2-af: npc: cn20k: Allocate default MCAM indexes") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-9-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index bb0a9ac7aab3..b3f34b84c114 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -4016,6 +4016,13 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, void *val; int i, j; + for (i = 0; i < ARRAY_SIZE(ptr); i++) { + if (!ptr[i]) + continue; + + *ptr[i] = USHRT_MAX; + } + if (!npc_priv.init_done) return 0; @@ -4031,7 +4038,6 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, npc_dft_rule_name[NPC_DFT_RULE_PROMISC_ID], pcifunc); - *ptr[0] = USHRT_MAX; return -ESRCH; } @@ -4051,7 +4057,6 @@ int 
npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, npc_dft_rule_name[NPC_DFT_RULE_UCAST_ID], pcifunc); - *ptr[3] = USHRT_MAX; return -ESRCH; } @@ -4071,7 +4076,6 @@ int npc_cn20k_dft_rules_idx_get(struct rvu *rvu, u16 pcifunc, u16 *bcast, __func__, npc_dft_rule_name[i], pcifunc); - *ptr[j] = USHRT_MAX; continue; } -- cgit v1.2.3 From 013717353c03b65f5b00a5cefa1515b6b45777b7 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:21 +0530 Subject: octeontx2-af: npc: cn20k: Tear down default MCAM rules explicitly on free npc_cn20k_dft_rules_free() used the NPC MCAM mbox "free all" path, which does not match how cn20k tracks default-rule MCAM slot indexes. Resolve the default-rule indices, then for each valid slot clear the bitmap entry, drop the PF/VF map, disable the MCAM line, clear the target function, and npc_cn20k_idx_free(). Remove any matching software mcam_rules nodes. On hard failure from idx_free, WARN and stop so the box stays up for analysis. In npc_mcam_free_all_entries(), prefetch the same default-rule indices and, on cn20k, skip bitmap clear and idx_free when the scanned entry is one of those reserved defaults (they are released by npc_cn20k_dft_rules_free).
Fixes: 09d3b7a1403f ("octeontx2-af: npc: cn20k: Allocate default MCAM indexes") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-10-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 51 +++++++++++++++---- .../net/ethernet/marvell/octeontx2/af/rvu_npc.c | 59 +++++++++++++++------- 2 files changed, 82 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index b3f34b84c114..1129565a01bd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -4178,11 +4178,11 @@ static bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc) void npc_cn20k_dft_rules_free(struct rvu *rvu, u16 pcifunc) { - struct npc_mcam_free_entry_req free_req = { 0 }; + struct npc_mcam *mcam = &rvu->hw->mcam; + u16 ptr[4] = {[0 ... 3] = USHRT_MAX}; + struct rvu_npc_mcam_rule *rule, *tmp; unsigned long index; - struct msg_rsp rsp; - u16 ptr[4]; - int rc, i; + int blkaddr, rc, i; void *map; if (!npc_priv.init_done) @@ -4240,14 +4240,43 @@ void npc_cn20k_dft_rules_free(struct rvu *rvu, u16 pcifunc) } free_rules: + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); + if (blkaddr < 0) + return; + for (int i = 0; i < 4; i++) { + if (ptr[i] == USHRT_MAX) + continue; - free_req.hdr.pcifunc = pcifunc; - free_req.all = 1; - rc = rvu_mbox_handler_npc_mcam_free_entry(rvu, &free_req, &rsp); - if (rc) - dev_err(rvu->dev, - "%s: Error deleting default entries (pcifunc=%#x\n", - __func__, pcifunc); + mutex_lock(&mcam->lock); + npc_mcam_clear_bit(mcam, ptr[i]); + mcam->entry2pfvf_map[ptr[i]] = NPC_MCAM_INVALID_MAP; + npc_cn20k_enable_mcam_entry(rvu, blkaddr, ptr[i], false); + mcam->entry2target_pffunc[ptr[i]] = 0x0; + mutex_unlock(&mcam->lock); + + rc = npc_cn20k_idx_free(rvu, &ptr[i], 1); + if (rc) { + /* Non recoverable error. Let us WARN and return. 
Keep system alive to + * enable debugging + */ + WARN(1, "%s Error deleting default entries (pcifunc=%#x) mcam_idx=%u\n", + __func__, pcifunc, ptr[i]); + return; + } + } + + mutex_lock(&mcam->lock); + list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) { + for (int i = 0; i < 4; i++) { + if (ptr[i] != rule->entry) + continue; + + list_del(&rule->list); + kfree(rule); + break; + } + } + mutex_unlock(&mcam->lock); } int npc_cn20k_dft_rules_alloc(struct rvu *rvu, u16 pcifunc) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 44ca65efc80f..5d349d131fdb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -2521,33 +2521,58 @@ void npc_mcam_clear_bit(struct npc_mcam *mcam, u16 index) static void npc_mcam_free_all_entries(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr, u16 pcifunc) { + u16 dft_idxs[NPC_DFT_RULE_MAX_ID] = {[0 ... NPC_DFT_RULE_MAX_ID - 1] = USHRT_MAX}; + bool cn20k_dft_rl; u16 index, cntr; int rc; + npc_cn20k_dft_rules_idx_get(rvu, pcifunc, + &dft_idxs[NPC_DFT_RULE_BCAST_ID], + &dft_idxs[NPC_DFT_RULE_MCAST_ID], + &dft_idxs[NPC_DFT_RULE_PROMISC_ID], + &dft_idxs[NPC_DFT_RULE_UCAST_ID]); + /* Scan all MCAM entries and free the ones mapped to 'pcifunc' */ for (index = 0; index < mcam->bmap_entries; index++) { - if (mcam->entry2pfvf_map[index] == pcifunc) { + if (mcam->entry2pfvf_map[index] != pcifunc) + continue; + + cn20k_dft_rl = false; + + if (is_cn20k(rvu->pdev)) { + if (dft_idxs[NPC_DFT_RULE_BCAST_ID] == index || + dft_idxs[NPC_DFT_RULE_MCAST_ID] == index || + dft_idxs[NPC_DFT_RULE_PROMISC_ID] == index || + dft_idxs[NPC_DFT_RULE_UCAST_ID] == index) { + cn20k_dft_rl = true; + } + } + + /* Disable the entry */ + npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false); + + if (!cn20k_dft_rl) { mcam->entry2pfvf_map[index] = NPC_MCAM_INVALID_MAP; /* Free the entry in bitmap */ npc_mcam_clear_bit(mcam, 
index); - /* Disable the entry */ - npc_enable_mcam_entry(rvu, mcam, blkaddr, index, false); - - /* Update entry2counter mapping */ - cntr = mcam->entry2cntr_map[index]; - if (cntr != NPC_MCAM_INVALID_MAP) - npc_unmap_mcam_entry_and_cntr(rvu, mcam, - blkaddr, index, - cntr); mcam->entry2target_pffunc[index] = 0x0; - if (is_cn20k(rvu->pdev)) { - rc = npc_cn20k_idx_free(rvu, &index, 1); - if (rc) - dev_err(rvu->dev, - "Failed to free mcam idx=%u pcifunc=%#x\n", - index, pcifunc); - } } + + /* Update entry2counter mapping */ + cntr = mcam->entry2cntr_map[index]; + if (cntr != NPC_MCAM_INVALID_MAP) + npc_unmap_mcam_entry_and_cntr(rvu, mcam, + blkaddr, index, + cntr); + + if (!is_cn20k(rvu->pdev) || cn20k_dft_rl) + continue; + + rc = npc_cn20k_idx_free(rvu, &index, 1); + if (rc) + dev_err(rvu->dev, + "Failed to free mcam idx=%u pcifunc=%#x\n", + index, pcifunc); } } -- cgit v1.2.3 From bc968f61bf0ad4f085559e5e3d168105fdf88204 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Wed, 29 Apr 2026 07:57:22 +0530 Subject: octeontx2-af: npc: cn20k: Reject missing default-rule MCAM indices When cn20k default L2 rules are not installed, npc_cn20k_dft_rules_idx_get() leaves broadcast, multicast, promiscuous, and unicast slots at USHRT_MAX. npc_get_nixlf_mcam_index() previously returned that sentinel as a valid MCAM index, so callers could program hardware with an invalid index. Return -EINVAL from the cn20k branches of npc_get_nixlf_mcam_index() when the requested slot is still USHRT_MAX. Harden cn20k NPC MCAM entry helpers to reject out-of-range indices before touching hardware. Drop the early bounds check in npc_enable_mcam_entry() for cn20k so invalid indices are validated inside npc_cn20k_enable_mcam_entry() instead of being silently ignored. In rvu_npc_update_flowkey_alg_idx(), treat negative MCAM indices like out-of-range values, and only update RSS actions for promiscuous and all-multi paths when the resolved index is non-negative. 
Cc: Suman Ghosh Fixes: 6d1e70282f76 ("octeontx2-af: npc: cn20k: Use common APIs") Signed-off-by: Ratheesh Kannoth Link: https://patch.msgid.link/20260429022722.1110289-11-rkannoth@marvell.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.c | 14 ++- .../net/ethernet/marvell/octeontx2/af/cn20k/npc.h | 1 + .../net/ethernet/marvell/octeontx2/af/rvu_nix.c | 3 + .../net/ethernet/marvell/octeontx2/af/rvu_npc.c | 137 +++++++++++++++++++-- .../net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 10 +- 5 files changed, 155 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c index 1129565a01bd..6b3f453fd500 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.c @@ -808,6 +808,9 @@ npc_cn20k_enable_mcam_entry(struct rvu *rvu, int blkaddr, u64 cfg, hw_prio; u8 kw_type; + if (index < 0 || index >= mcam->total_entries) + return -EINVAL; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) return -EINVAL; @@ -1056,6 +1059,9 @@ int npc_cn20k_config_mcam_entry(struct rvu *rvu, int blkaddr, int index, int kw = 0; u8 kw_type; + if (index < 0 || index >= mcam->total_entries) + return -EINVAL; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) return -EINVAL; @@ -1148,6 +1154,9 @@ int npc_cn20k_copy_mcam_entry(struct rvu *rvu, int blkaddr, u16 src, u16 dest) int bank, i, sb, db; int dbank, sbank; + if (src >= mcam->total_entries || dest >= mcam->total_entries) + return -EINVAL; + dbank = npc_get_bank(mcam, dest); sbank = npc_get_bank(mcam, src); @@ -1213,6 +1222,9 @@ int npc_cn20k_read_mcam_entry(struct rvu *rvu, int blkaddr, u16 index, int kw = 0, bank; u8 kw_type; + if (index >= mcam->total_entries) + return -EINVAL; + if (npc_mcam_idx_2_key_type(rvu, index, &kw_type)) return -EINVAL; @@ -4170,7 +4182,7 @@ int rvu_mbox_handler_npc_get_dft_rl_idxs(struct rvu *rvu, struct msg_req 
*req, return 0; } -static bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc) +bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc) { return is_pf_cgxmapped(rvu, rvu_get_pf(rvu->pdev, pcifunc)) || is_lbk_vf(rvu, pcifunc); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h index 2f761b97f91b..3d5eb952cc07 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/cn20k/npc.h @@ -335,5 +335,6 @@ int npc_mcam_idx_2_key_type(struct rvu *rvu, u16 mcam_idx, u8 *key_type); u16 npc_cn20k_vidx2idx(u16 index); u16 npc_cn20k_idx2vidx(u16 idx); int npc_cn20k_defrag(struct rvu *rvu); +bool npc_is_cgx_or_lbk(struct rvu *rvu, u16 pcifunc); #endif /* NPC_CN20K_H */ diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c index ef5b081162eb..f977734ae712 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c @@ -3577,6 +3577,9 @@ static int nix_update_mce_rule(struct rvu *rvu, u16 pcifunc, mcam_index = npc_get_nixlf_mcam_index(mcam, pcifunc & ~RVU_PFVF_FUNC_MASK, nixlf, type); + if (mcam_index < 0) + return -EINVAL; + err = nix_update_mce_list(rvu, pcifunc, mce_list, mce_idx, mcam_index, add); return err; diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 5d349d131fdb..3c814d157ab9 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -163,14 +163,35 @@ int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, if (rc) return -EFAULT; + if (is_lbk_vf(rvu, pcifunc)) { + if (promisc == USHRT_MAX) + return -EINVAL; + return promisc; + } + + if (is_cgx_vf(rvu, pcifunc)) { + if (ucast == USHRT_MAX) + return -EINVAL; + + return ucast; + } + switch (type) { case NIXLF_BCAST_ENTRY: + if (bcast == 
USHRT_MAX) + return -EINVAL; return bcast; case NIXLF_ALLMULTI_ENTRY: + if (mcast == USHRT_MAX) + return -EINVAL; return mcast; case NIXLF_PROMISC_ENTRY: + if (promisc == USHRT_MAX) + return -EINVAL; return promisc; case NIXLF_UCAST_ENTRY: + if (ucast == USHRT_MAX) + return -EINVAL; return ucast; default: return -EINVAL; @@ -238,9 +259,6 @@ void npc_enable_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, int actbank = bank; if (is_cn20k(rvu->pdev)) { - if (index < 0 || index >= mcam->banksize * mcam->banks) - return; - if (npc_cn20k_enable_mcam_entry(rvu, blkaddr, index, enable)) dev_err(rvu->dev, "Error to %s mcam %u entry\n", enable ? "enable" : "disable", index); @@ -434,6 +452,15 @@ static u64 npc_get_default_entry_action(struct rvu *rvu, struct npc_mcam *mcam, index = npc_get_nixlf_mcam_index(mcam, pf_func, nixlf, NIXLF_UCAST_ENTRY); + + if (index < 0) { + dev_err(rvu->dev, + "%s: failed to get ucast entry pcifunc:0x%x\n", + __func__, pf_func); + /* Action 0 is drop */ + return 0; + } + bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); @@ -700,6 +727,12 @@ void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } /* Don't change the action if entry is already enabled * Otherwise RSS action may get overwritten. 
@@ -755,11 +788,21 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_PROMISC_ENTRY); + /* In cn20k, default indexes are installed only for CGX mapped + * and lbk interfaces + */ if (is_cgx_vf(rvu, pcifunc)) index = npc_get_nixlf_mcam_index(mcam, pcifunc & ~RVU_PFVF_FUNC_MASK, nixlf, NIXLF_PROMISC_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get promisc entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + /* If the corresponding PF's ucast action is RSS, * use the same action for promisc also * Please note that for lbk(s) "index" and "ucast_idx" @@ -770,6 +813,12 @@ void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc, else ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (ucast_idx < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast/promisc entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx)) *(u64 *)&action = npc_get_mcam_action(rvu, mcam, @@ -844,6 +893,14 @@ void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_PROMISC_ENTRY); + + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get promisc entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); } @@ -884,6 +941,12 @@ void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_BCAST_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get bcast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } if (!hw->cap.nix_rx_multicast) { /* Early silicon doesn't support pkt replication, @@ -948,12 +1011,25 @@ void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_ALLMULTI_ENTRY); + if (index < 0) 
{ + dev_err(rvu->dev, + "%s: Error to get mcast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } /* If the corresponding PF's ucast action is RSS, * use the same action for multicast entry also */ ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (ucast_idx < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx)) *(u64 *)&action = npc_get_mcam_action(rvu, mcam, blkaddr, ucast_idx); @@ -1018,6 +1094,13 @@ void rvu_npc_enable_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_ALLMULTI_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get mcast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } + npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); } @@ -1130,8 +1213,12 @@ void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf, index = mcam_index; } - if (index >= mcam->total_entries) + if (index < 0 || index >= mcam->total_entries) { + dev_err(rvu->dev, + "%s: Invalid mcam index, pcifunc=%#x\n", + __func__, pcifunc); return; + } bank = npc_get_bank(mcam, index); index &= (mcam->banksize - 1); @@ -1175,16 +1262,18 @@ void rvu_npc_update_flowkey_alg_idx(struct rvu *rvu, u16 pcifunc, int nixlf, /* If PF's promiscuous entry is enabled, * Set RSS action for that entry as well */ - npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, - blkaddr, alg_idx); + if (index >= 0) + npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, + blkaddr, alg_idx); index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_ALLMULTI_ENTRY); /* If PF's allmulti entry is enabled, * Set RSS action for that entry as well */ - npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, - blkaddr, alg_idx); + if (index >= 0) + npc_update_rx_action_with_alg_idx(rvu, action, pfvf, index, + blkaddr, alg_idx); 
} } @@ -1197,12 +1286,22 @@ void npc_enadis_default_mce_entry(struct rvu *rvu, u16 pcifunc, int index, blkaddr, mce_idx; struct rvu_pfvf *pfvf; + /* multicast pkt replication is not enabled for AF's VFs & SDP links */ + if (is_lbk_vf(rvu, pcifunc) || is_sdp_pfvf(rvu, pcifunc)) + return; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); if (blkaddr < 0) return; index = npc_get_nixlf_mcam_index(mcam, pcifunc & ~RVU_PFVF_FUNC_MASK, nixlf, type); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get entry for pcifunc=%#x, type=%u\n", + __func__, pcifunc, type); + return; + } /* disable MCAM entry when packet replication is not supported by hw */ if (!hw->cap.nix_rx_multicast && !is_vf(pcifunc)) { @@ -1231,6 +1330,10 @@ static void npc_enadis_default_entries(struct rvu *rvu, u16 pcifunc, struct npc_mcam *mcam = &rvu->hw->mcam; int index, blkaddr; + /* only CGX or LBK interfaces have default entries */ + if (is_cn20k(rvu->pdev) && !npc_is_cgx_or_lbk(rvu, pcifunc)) + return; + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); if (blkaddr < 0) return; @@ -1240,6 +1343,12 @@ static void npc_enadis_default_entries(struct rvu *rvu, u16 pcifunc, pfvf->nix_rx_intf)) { index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (index < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); } @@ -3897,6 +4006,12 @@ int rvu_mbox_handler_npc_read_base_steer_rule(struct rvu *rvu, /* Read the default ucast entry if there is no pkt steering rule */ index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (index < 0) { + mutex_unlock(&mcam->lock); + rc = NIX_AF_ERR_AF_LF_INVALID; + goto out; + } + read_entry: /* Read the mcam entry */ npc_read_mcam_entry(rvu, mcam, blkaddr, index, &rsp->entry, &intf, @@ -3970,6 +4085,12 @@ void rvu_npc_clear_ucast_entry(struct rvu *rvu, int pcifunc, int nixlf) ucast_idx = 
npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, NIXLF_UCAST_ENTRY); + if (ucast_idx < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for pcifunc=%#x\n", + __func__, pcifunc); + return; + } npc_enable_mcam_entry(rvu, mcam, blkaddr, ucast_idx, false); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index fe10554b1f0e..6ae9cdcb608b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -1444,7 +1444,7 @@ static int npc_install_flow(struct rvu *rvu, int blkaddr, u16 target, struct msg_rsp write_rsp; struct mcam_entry *entry; bool new = false; - u16 entry_index; + int entry_index; int err; installed_features = req->features; @@ -1477,6 +1477,14 @@ static int npc_install_flow(struct rvu *rvu, int blkaddr, u16 target, if (req->default_rule) { entry_index = npc_get_nixlf_mcam_index(mcam, target, nixlf, NIXLF_UCAST_ENTRY); + + if (entry_index < 0) { + dev_err(rvu->dev, + "%s: Error to get ucast entry for target=%#x\n", + __func__, target); + return -EINVAL; + } + enable = is_mcam_entry_enabled(rvu, mcam, blkaddr, entry_index); } -- cgit v1.2.3 From baa3c65435fb3f450b262672bc06db887a92d397 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 30 Apr 2026 21:55:01 +0200 Subject: netfilter: flowtable: use skb_pull_rcsum() to pop vlan/pppoe header This adjusts the checksum, if required, after pulling the layer 2 header, either the pppoe header or the inner vlan header in the double-tagged vlan packets. 
Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support") Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 2eba64eb393a..9c05a50d6013 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -445,13 +445,13 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, switch (skb->protocol) { case htons(ETH_P_8021Q): vlan_hdr = (struct vlan_hdr *)skb->data; - __skb_pull(skb, VLAN_HLEN); + skb_pull_rcsum(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vlan_hdr); skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): skb->protocol = __nf_flow_pppoe_proto(skb); - skb_pull(skb, PPPOE_SES_HLEN); + skb_pull_rcsum(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; } -- cgit v1.2.3 From f93836b236773862e9ee268a82e3614caf77ea01 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 30 Apr 2026 23:34:33 +0200 Subject: net: usb: r8152: add TRENDnet TUC-ET2G v2.0 The TRENDnet TUC-ET2G V2.0 is an RTL8156B based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. 
Signed-off-by: Aleksander Jan Bajkowski Reviewed-by: Andrew Lunn Reviewed-by: Birger Koblitz Link: https://patch.msgid.link/20260430213435.21821-1-olek2@wp.pl Signed-off-by: Jakub Kicinski --- drivers/net/usb/r8152.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 7337bf1b7d6a..1ace1d2398c9 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -10138,6 +10138,7 @@ static const struct usb_device_id rtl8152_table[] = { { USB_DEVICE(VENDOR_ID_DELL, 0xb097) }, { USB_DEVICE(VENDOR_ID_ASUS, 0x1976) }, { USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02b) }, + { USB_DEVICE(VENDOR_ID_TRENDNET, 0xe02c) }, {} }; -- cgit v1.2.3 From 4f34002e2e37639133693c13a2a9977fab86880d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 07:06:11 +0000 Subject: ipmr: prevent info-leak in ipmr_cache_report() Yiming Qian reported: `ipmr_cache_report()` allocates a report skb with `alloc_skb(128, GFP_ATOMIC)` and appends a `struct igmphdr` using `skb_put()`. In the non-`IGMPMSG_WHOLEPKT` path it initializes only: - `igmp->type` - `igmp->code` but does not initialize: - `igmp->csum` - `igmp->group` Later, `igmpmsg_netlink_event()` copies the bytes after `sizeof(struct igmpmsg)` into the `IPMRA_CREPORT_PKT` netlink attribute and emits `RTM_NEWCACHEREPORT` on `RTNLGRP_IPV4_MROUTE_R`. As a result, 6 bytes of stale heap data from the skb head are disclosed to userspace. Let's use skb_put_zero() instead of skb_put() to fix this bug. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Yiming Qian Signed-off-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260430070611.4004529-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 2058ca860294..05fb6eefe0be 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1112,11 +1112,12 @@ static int ipmr_cache_report(const struct mr_table *mrt, msg->im_vif_hi = vifi >> 8; ipv4_pktinfo_prepare(mroute_sk, pkt, false); memcpy(skb->cb, pkt->cb, sizeof(skb->cb)); - /* Add our header */ - igmp = skb_put(skb, sizeof(struct igmphdr)); + /* Add our header. + * Note that code, csum and group fields are cleared. + */ + igmp = skb_put_zero(skb, sizeof(struct igmphdr)); igmp->type = assert; msg->im_msgtype = assert; - igmp->code = 0; ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } -- cgit v1.2.3 From 4b9e327991815e128ad3af75c3a04630a63ce3e0 Mon Sep 17 00:00:00 2001 From: Kai Zen Date: Thu, 30 Apr 2026 18:26:48 +0300 Subject: net: rtnetlink: zero ifla_vf_broadcast to avoid stack infoleak in rtnl_fill_vfinfo rtnl_fill_vfinfo() declares struct ifla_vf_broadcast on the stack without initialisation: struct ifla_vf_broadcast vf_broadcast; The struct contains a single fixed 32-byte field: /* include/uapi/linux/if_link.h */ struct ifla_vf_broadcast { __u8 broadcast[32]; }; The function then copies dev->broadcast into it using dev->addr_len as the length: memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); On Ethernet devices (the overwhelming majority of SR-IOV NICs) dev->addr_len is 6, so only the first 6 bytes of broadcast[] are written. The remaining 26 bytes retain whatever was previously on the kernel stack. 
The full struct is then handed to userspace via: nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) leaking up to 26 bytes of uninitialised kernel stack per VF per RTM_GETLINK request, repeatable. The other vf_* structs in the same function are explicitly zeroed for exactly this reason - see the memset() calls for ivi, vf_vlan_info, node_guid and port_guid a few lines above. vf_broadcast was simply missed when it was added. Reachability: any unprivileged local process can open AF_NETLINK / NETLINK_ROUTE without capabilities and send RTM_GETLINK with an IFLA_EXT_MASK attribute carrying RTEXT_FILTER_VF. The kernel walks each VF and emits IFLA_VF_BROADCAST, leaking 26 bytes of stack per VF per request. Stack residue at this call site can include return addresses and transient sensitive data; KASAN with stack instrumentation, or KMSAN, will flag the nla_put() when reproduced. Zero the on-stack struct before the partial memcpy, matching the existing pattern used for the other vf_* structs in the same function. 
Fixes: 75345f888f70 ("ipoib: show VF broadcast address") Cc: stable@vger.kernel.org Signed-off-by: Kai Zen Link: https://patch.msgid.link/3c506e8f936e52b57620269b55c348af05d413a2.1777557228.git.kai.aizen.dev@gmail.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b613bb6e07df..df042da422ef 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memset(&vf_broadcast, 0, sizeof(vf_broadcast)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; -- cgit v1.2.3 From c6bebaa744f7579eb72800a262fbfeb93e40db04 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 16:48:36 +0000 Subject: ipv4: igmp: annotate data-races in igmp_heard_query() Multiple cpus can run igmp_heard_query() concurrently. Add missing READ_ONCE()/WRITE_ONCE() over following in_dev fields. - mr_qrv - mr_qi - mr_qri - mr_v1_seen - mr_v2_seen Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+ae9a171f239b14485310@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f38675.050a0220.3cbe47.0002.GAE@google.com Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260430164836.872079-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/igmp.c | 58 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a674fb44ec25..a9ad39064f3b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -122,16 +122,29 @@ * contradict to specs provided this delay is small enough. 
*/ -#define IGMP_V1_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ - IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ - ((in_dev)->mr_v1_seen && \ - time_before(jiffies, (in_dev)->mr_v1_seen))) -#define IGMP_V2_SEEN(in_dev) \ - (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ - IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ - ((in_dev)->mr_v2_seen && \ - time_before(jiffies, (in_dev)->mr_v2_seen))) +static bool IGMP_V1_SEEN(const struct in_device *in_dev) +{ + unsigned long seen; + + if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1) + return true; + if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1) + return true; + seen = READ_ONCE(in_dev->mr_v1_seen); + return seen && time_before(jiffies, seen); +} + +static bool IGMP_V2_SEEN(const struct in_device *in_dev) +{ + unsigned long seen; + + if (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2) + return true; + if (IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2) + return true; + seen = READ_ONCE(in_dev->mr_v2_seen); + return seen && time_before(jiffies, seen); +} static int unsolicited_report_interval(struct in_device *in_dev) { @@ -954,23 +967,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); - + unsigned long seen; if (len == 8) { + seen = jiffies + READ_ONCE(in_dev->mr_qrv) * READ_ONCE(in_dev->mr_qi) + + READ_ONCE(in_dev->mr_qri); if (ih->code == 0) { /* Alas, old v1 router presents here. 
*/ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; - in_dev->mr_v1_seen = jiffies + - (in_dev->mr_qrv * in_dev->mr_qi) + - in_dev->mr_qri; + WRITE_ONCE(in_dev->mr_v1_seen, seen); group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); - in_dev->mr_v2_seen = jiffies + - (in_dev->mr_qrv * in_dev->mr_qi) + - in_dev->mr_qri; + WRITE_ONCE(in_dev->mr_v2_seen, seen); } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); @@ -995,6 +1006,8 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ + unsigned long mr_qi; + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; @@ -1015,15 +1028,16 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, * received value was zero, use the default or statically * configured value. */ - in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); - in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; - + WRITE_ONCE(in_dev->mr_qrv, + ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); + mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; + WRITE_ONCE(in_dev->mr_qi, mr_qi); /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ - if (in_dev->mr_qri >= in_dev->mr_qi) - in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; + if (READ_ONCE(in_dev->mr_qri) >= mr_qi) + WRITE_ONCE(in_dev->mr_qri, (mr_qi/HZ - 1) * HZ); if (!group) { /* general query */ if (ih3->nsrcs) -- cgit v1.2.3 From a5148bc2fa27092862ac4b9e7b5c8340d60cff34 Mon Sep 17 00:00:00 2001 From: Alex Cheema Date: Wed, 29 Apr 2026 18:57:39 +0100 Subject: net: usb: cdc_ncm: add Apple Mac USB-C direct networking quirk Apple Silicon Macs expose two CDC NCM "private" data interfaces over USB-C with VID:PID 0x05ac:0x1905 and product string "Mac". 
This is the same protocol Apple already ships on iPhone (0x05ac:0x12a8) and iPad (0x05ac:0x12ab) for RemoteXPC since iOS 17 -- both data interfaces lack an interrupt status endpoint, so they rely on the FLAG_LINK_INTR- conditional bind path introduced in commit 3ec8d7572a69 ("CDC-NCM: add support for Apple's private interface"). The id_table currently has entries for iPhone and iPad but not for the Mac. Without a match, cdc_ncm falls through to the generic CDC NCM class-match entry, which uses the FLAG_LINK_INTR-having cdc_ncm_info struct, so bind_common() fails on the missing status endpoint and no netdev appears. Add id_table entries for both interface numbers (0 and 2) of the Mac, bound to the existing apple_private_interface_info driver_info. Verified empirically on a Mac Studio M3 Ultra running macOS 26.5: when a Mac is connected via USB-C, ioreg shows VID 0x05ac, PID 0x1905, product string "Mac", with two NCM data interfaces at numbers 0 and 2. The same PID is presented by all current Apple Silicon Mac models (MacBook Pro/Air, Mac mini, Mac Studio across the M-series), mirroring Apple's single-PID-per-family pattern from iPhone/iPad. After this patch, plugging a Mac into a Linux host running the patched kernel produces two enx... interfaces (one per data interface), "ip -br link" lists them as UP, and standard userspace networking (DHCP, NetworkManager shared mode, etc.) works without any modprobe overrides or out-of-tree modules. 
Signed-off-by: Alex Cheema Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429175739.34426-1-alex@exolabs.net Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index bb9929727eb9..0223a172851e 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -2012,6 +2012,14 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&apple_private_interface_info, }, + /* Mac */ + { USB_DEVICE_INTERFACE_NUMBER(0x05ac, 0x1905, 0), + .driver_info = (unsigned long)&apple_private_interface_info, + }, + { USB_DEVICE_INTERFACE_NUMBER(0x05ac, 0x1905, 2), + .driver_info = (unsigned long)&apple_private_interface_info, + }, + /* Ericsson MBM devices like F5521gw */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, -- cgit v1.2.3 From 6d4106e8df94c0c52cf3ca6a6a0d01567fb3844e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 08:00:56 +0000 Subject: net/sched: sch_pie: annotate more data-races in pie_dump_stats() My prior patch missed few READ_ONCE()/WRITE_ONCE() annotations. Fixes: 5154561d9b11 ("net/sched: sch_pie: annotate data-races in pie_dump_stats()") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260430080056.35104-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_pie.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index fb53fbf0e328..b41f2def2e2c 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -219,16 +219,14 @@ void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params, * packet timestamp. */ if (!params->dq_rate_estimator) { - vars->qdelay = now - pie_get_enqueue_time(skb); + WRITE_ONCE(vars->qdelay, + backlog ? 
now - pie_get_enqueue_time(skb) : 0); if (vars->dq_tstamp != DTIME_INVALID) dtime = now - vars->dq_tstamp; vars->dq_tstamp = now; - if (backlog == 0) - vars->qdelay = 0; - if (dtime == 0) return; @@ -376,7 +374,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); - vars->prob += delta; + WRITE_ONCE(vars->prob, vars->prob + delta); if (delta > 0) { /* prevent overflow */ @@ -401,7 +399,7 @@ void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars, if (qdelay == 0 && qdelay_old == 0 && update_prob) /* Reduce drop probability to 98.4% */ - vars->prob -= vars->prob / 64; + WRITE_ONCE(vars->prob, vars->prob - vars->prob / 64); WRITE_ONCE(vars->qdelay, qdelay); vars->backlog_old = backlog; @@ -501,7 +499,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { - .prob = q->vars.prob << BITS_PER_BYTE, + .prob = READ_ONCE(q->vars.prob) << BITS_PER_BYTE, .delay = ((u32)PSCHED_TICKS2NS(READ_ONCE(q->vars.qdelay))) / NSEC_PER_USEC, .packets_in = READ_ONCE(q->stats.packets_in), @@ -512,7 +510,7 @@ static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) }; /* avg_dq_rate is only valid if dq_rate_estimator is enabled */ - st.dq_rate_estimating = q->params.dq_rate_estimator; + st.dq_rate_estimating = READ_ONCE(q->params.dq_rate_estimator); /* unscale and return dq_rate in bytes per sec */ if (st.dq_rate_estimating) -- cgit v1.2.3 From 4bc852006b62eae8ea77e797192d089367e854ff Mon Sep 17 00:00:00 2001 From: Sagarika Sharma Date: Thu, 30 Apr 2026 20:09:00 +0000 Subject: ipv6: update route serial number on NETDEV_CHANGE When using IPv6 ECMP routes, if a netdev listed as a nexthop experiences a carrier change event (e.g., a bond device generating a NETDEV_CHANGE event after its slaves go linkdown), established connections utilizing that nexthop fail to 
fail over to other available nexthops. Instead, these connections stall or drop. This happens because the IPv6 FIB code does not invalidate the socket's cached destination when a NETDEV_CHANGE event occurs. While fib6_ifdown() correctly marks the nexthop with RTNH_F_LINKDOWN, it leaves the route's serial number unchanged. As a result, sockets with a previously cached dst do not realize the route is no longer viable and continue to try using the non-functional nexthop. This behavior contrasts with IPv4, which actively flushes cached destinations on a NETDEV_CHANGE event (see fib_netdev_event() in net/ipv4/fib_frontend.c). Fix this by updating the route serial number in fib6_ifdown() when setting RTNH_F_LINKDOWN. This invalidates stale cached destinations, forcing sockets to perform a new route lookup and fail over to a functioning nexthop. Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)") Signed-off-by: Sagarika Sharma Reviewed-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260430200909.527827-2-sharmasagarika@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 19eb6b702227..0dc0316530ca 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4995,6 +4995,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; + fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); break; } -- cgit v1.2.3 From d1ae37dc6881a6a9113c8545cdbba731393d8dcd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 30 Apr 2026 20:09:01 +0000 Subject: selftest: net: Add test for TCP flow failover with ECMP routes. Without the previous commit, TCP failed to switch to alternative IPv6 routes immediately upon carrier loss. 
It would persist with the dead route until reaching the threshold net.ipv4.tcp_retries1, leading to unnecessary delays in failover. Let's add a selftest for this scenario to ensure TCP fails over immediately upon a carrier loss event. Before: TEST: TCP IPv4 failover [ OK ] TEST: TCP IPv6 failover [FAIL] After: TEST: TCP IPv4 failover [ OK ] TEST: TCP IPv6 failover [ OK ] Signed-off-by: Kuniyuki Iwashima Signed-off-by: Sagarika Sharma Link: https://patch.msgid.link/20260430200909.527827-3-sharmasagarika@google.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/tcp_ecmp_failover.sh | 216 +++++++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100755 tools/testing/selftests/net/tcp_ecmp_failover.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index a275ed584026..f3da38c54d27 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -96,6 +96,7 @@ TEST_PROGS := \ srv6_hl2encap_red_l2vpn_test.sh \ srv6_iptunnel_cache.sh \ stress_reuseport_listen.sh \ + tcp_ecmp_failover.sh \ tcp_fastopen_backup_key.sh \ test_bpf.sh \ test_bridge_backup_port.sh \ diff --git a/tools/testing/selftests/net/tcp_ecmp_failover.sh b/tools/testing/selftests/net/tcp_ecmp_failover.sh new file mode 100755 index 000000000000..5768aa8bff6a --- /dev/null +++ b/tools/testing/selftests/net/tcp_ecmp_failover.sh @@ -0,0 +1,216 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright 2026 Google LLC. +# +# This test verifies TCP flow failover between ECMP routes +# upon carrier loss on the active device. +# +# socat -----------------------------> socat +# | +# .-- veth-c1 -|- veth-s1 --. 
+# dummy0 -| | |-- dummy0 +# '-- veth-c2 -|- veth-s2 --' +# | +# + +REQUIRE_JQ=no +REQUIRE_MZ=no +NUM_NETIFS=0 + +source forwarding/lib.sh + +CLIENT_IP="10.0.59.1" +SERVER_IP="10.0.92.1" +CLIENT_IP6="2001:db8:5a9a::1" +SERVER_IP6="2001:db8:9292::1" + +setup_server() +{ + IP="ip -n $server" + NS_EXEC="ip netns exec $server" + + $IP link add dummy0 type dummy + $IP link set dummy0 up + + $IP -4 addr add $SERVER_IP/32 dev dummy0 + $IP -6 addr add $SERVER_IP6/128 dev dummy0 nodad + + $IP link set veth-s1 up + $IP link set veth-s2 up + + $IP -4 addr add 192.168.1.2/24 dev veth-s1 + $IP -4 addr add 192.168.2.2/24 dev veth-s2 + + $IP -4 route add $CLIENT_IP/32 \ + nexthop via 192.168.1.1 dev veth-s1 weight 1 \ + nexthop via 192.168.2.1 dev veth-s2 weight 1 + + $IP -6 addr add 2001:db8:1::2/64 dev veth-s1 nodad + $IP -6 addr add 2001:db8:2::2/64 dev veth-s2 nodad + + $IP -6 route add $CLIENT_IP6/128 \ + nexthop via 2001:db8:1::1 dev veth-s1 weight 1 \ + nexthop via 2001:db8:2::1 dev veth-s2 weight 1 +} + +setup_client() +{ + IP="ip -n $client" + NS_EXEC="ip netns exec $client" + + $IP link add dummy0 type dummy + $IP link set dummy0 up + + $IP -4 addr add $CLIENT_IP/32 dev dummy0 + $IP -6 addr add $CLIENT_IP6/128 dev dummy0 nodad + + $IP link set veth-c1 up + $IP link set veth-c2 up + + $IP -4 addr add 192.168.1.1/24 dev veth-c1 + $IP -4 addr add 192.168.2.1/24 dev veth-c2 + + $IP -4 route add $SERVER_IP/32 \ + nexthop via 192.168.1.2 dev veth-c1 weight 1 \ + nexthop via 192.168.2.2 dev veth-c2 weight 1 + + $IP -6 addr add 2001:db8:1::1/64 dev veth-c1 nodad + $IP -6 addr add 2001:db8:2::1/64 dev veth-c2 nodad + + $IP -6 route add $SERVER_IP6/128 \ + nexthop via 2001:db8:1::2 dev veth-c1 weight 1 \ + nexthop via 2001:db8:2::2 dev veth-c2 weight 1 + + # By default, tcp_retries1=3 triggers a route refresh + # after 3 retransmits (~5s). Ensure this never occurs + # for test stability. 
+ $NS_EXEC sysctl -qw net.ipv4.tcp_retries1=100 + + # When NETDEV_CHANGE is issued for a dev tied to an ECMP + # route, RTNH_F_LINKDOWN is flagged and the sernum is + # bumped to invalidate the route via sk_dst_check(). + # + # Without ignore_routes_with_linkdown=1, subsequent + # lookups may still select the same RTNH_F_LINKDOWN route. + $NS_EXEC sysctl -qw net.ipv4.conf.veth-c1.ignore_routes_with_linkdown=1 + $NS_EXEC sysctl -qw net.ipv4.conf.veth-c2.ignore_routes_with_linkdown=1 + + $NS_EXEC sysctl -qw net.ipv6.conf.veth-c1.ignore_routes_with_linkdown=1 + $NS_EXEC sysctl -qw net.ipv6.conf.veth-c2.ignore_routes_with_linkdown=1 +} + +setup() +{ + setup_ns client server + + ip -n "$client" link add veth-c1 type veth peer veth-s1 netns "$server" + ip -n "$client" link add veth-c2 type veth peer veth-s2 netns "$server" + + setup_server + setup_client +} + +cleanup() +{ + cleanup_all_ns > /dev/null 2>&1 +} + +tcp_ecmp_failover() +{ + local pf=$1; shift + local server_ip=$1; shift + local client_ip=$1; shift + + RET=0 + + tcpdump_start veth-s1 "$server" + tcpdump_start veth-s2 "$server" + + ip netns exec "$server" \ + socat -u TCP-LISTEN:8080,pf="$pf",bind="$server_ip",reuseaddr /dev/null & + server_pid=$! + + # Wait for server to start listening. + # Sometimes client fails without this sleep. + sleep 1 + + ip netns exec "$client" \ + socat -u /dev/zero TCP:"$server_ip":8080,pf="$pf",bind="$client_ip" & + client_pid=$! + + # To capture enough packets. + sleep 3 + + tcpdump_stop veth-s1 + tcpdump_stop veth-s2 + + pkts_s1=$(tcpdump_show veth-s1 | wc -l) + pkts_s2=$(tcpdump_show veth-s2 | wc -l) + + tcpdump_cleanup veth-s1 + tcpdump_cleanup veth-s2 + + # Detect the device chosen by the client + if [ "$pkts_s1" -gt "$pkts_s2" ]; then + veth_down=veth-s1 + veth_up=veth-s2 + else + veth_down=veth-s2 + veth_up=veth-s1 + fi + + # Taking down $veth_down causes its peer to lose carrier, + # triggering NETDEV_CHANGE. 
This flags RTNH_F_LINKDOWN + # and bumps the sernum for the route associated with that + # peer, invalidating the cached dst in the TCP socket. + # + # Consequently, sk_dst_check() fails, forcing the subsequent + # lookup to select the remaining healthy route via $veth_up. + ip -n "$server" link set "$veth_down" down + + tcpdump_start "$veth_up" "$server" + + # To capture enough packets. + sleep 3 + + tcpdump_stop "$veth_up" + + kill -9 "$client_pid" > /dev/null 2>&1 + kill -9 "$server_pid" > /dev/null 2>&1 + wait 2> /dev/null + + pkts=$(tcpdump_show $veth_up | wc -l) + + tcpdump_cleanup "$veth_up" + + if [ "$pkts" -lt 1000 ]; then + RET=$ksft_fail + fi +} + +test_ipv4() +{ + setup + tcp_ecmp_failover IPv4 $SERVER_IP $CLIENT_IP + log_test "TCP IPv4 failover" + cleanup +} + +test_ipv6() +{ + setup + tcp_ecmp_failover IPv6 "[$SERVER_IP6]" "[$CLIENT_IP6]" + log_test "TCP IPv6 failover" + cleanup +} + +require_command socat +require_command tcpdump + +trap cleanup EXIT + +test_ipv4 +test_ipv6 + +exit "$EXIT_STATUS" -- cgit v1.2.3 From ddca6da148b8ced3e6d3d7fb3b2e5b4ed6359dc2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Apr 2026 11:23:35 +0100 Subject: MAINTAINERS: Add self for the DEC LANCE network driver Like with the rest of DECstation and TURBOchannel hardware I have been handling the DEC LANCE network driver for some 25 years now anyway. Signed-off-by: Maciej W. Rozycki Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/alpine.DEB.2.21.2604271113520.28583@angie.orcam.me.uk Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 27a073f53cea..ec8661b446fb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7077,6 +7077,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/debugobjec F: include/linux/debugobjects.h F: lib/debugobjects.c +DEC LANCE NETWORK DRIVER +M: "Maciej W. 
Rozycki" +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/amd/declance.c + DECSTATION PLATFORM SUPPORT M: "Maciej W. Rozycki" L: linux-mips@vger.kernel.org -- cgit v1.2.3 From 1a57efe250a13906396c2a4792f0090f142f9844 Mon Sep 17 00:00:00 2001 From: Holger Brunck Date: Wed, 29 Apr 2026 13:42:07 +0200 Subject: net: wan: fsl_ucc_hdlc: fix uhdlc_memclean Unmapping of uf_regs is done from ucc_fast_free and doesn't need to be done explicitly. If already unmapped ucc_fast_free will crash. Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Holger Brunck Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_ucc_hdlc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 3bd57527b1be..8155e92af14e 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -773,11 +773,6 @@ static void uhdlc_memclean(struct ucc_hdlc_private *priv) kfree(priv->tx_skbuff); priv->tx_skbuff = NULL; - if (priv->uf_regs) { - iounmap(priv->uf_regs); - priv->uf_regs = NULL; - } - if (priv->uccf) { ucc_fast_free(priv->uccf); priv->uccf = NULL; -- cgit v1.2.3 From 851bba8068d15f5a386da544096f7ed6bc16e551 Mon Sep 17 00:00:00 2001 From: Holger Brunck Date: Wed, 29 Apr 2026 13:42:08 +0200 Subject: net: wan: fsl_ucc_hdlc: fix ucc_hdlc_remove If the driver is used in a non tdm mode priv->utdm is a NULL pointer. Therefore we need to check this pointer first before checking si_regs. 
Fixes: c19b6d246a35 ("drivers/net: support hdlc function for QE-UCC") Signed-off-by: Holger Brunck Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/wan/fsl_ucc_hdlc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index 8155e92af14e..15bfb78381d4 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1250,12 +1250,12 @@ static void ucc_hdlc_remove(struct platform_device *pdev) uhdlc_memclean(priv); - if (priv->utdm->si_regs) { + if (priv->utdm && priv->utdm->si_regs) { iounmap(priv->utdm->si_regs); priv->utdm->si_regs = NULL; } - if (priv->utdm->siram) { + if (priv->utdm && priv->utdm->siram) { iounmap(priv->utdm->siram); priv->utdm->siram = NULL; } -- cgit v1.2.3 From 383d0fb8946921b4914ea0f360342e221d419d40 Mon Sep 17 00:00:00 2001 From: Gregory Fuchedgi Date: Wed, 29 Apr 2026 14:54:14 -0700 Subject: amd-xgbe: fix PTP addend overflow causing frozen clock XGBE_PTP_ACT_CLK_FREQ and XGBE_V2_PTP_ACT_CLK_FREQ were 10x too large (500MHz/1GHz instead of 50MHz/100MHz), causing the computed addend to overflow the 32-bit tstamp_addend. In the general case this would result in the clock advancing at the wrong rate. For v2 (PCI), ptpclk_rate is hardcoded to 125MHz, so the addend formula (ACT_CLK_FREQ << 32) / ptpclk_rate yields exactly 8 * 2^32, and when stored to the 32-bit tstamp_addend the value is zero. With addend = 0 the hardware accumulator never overflows and the PTP clock is fully stopped. For v1 (platform), ptpclk_rate is read from ACPI/DT so the exact overflow behavior depends on the firmware-reported frequency. Define the constants as NSEC_PER_SEC / SSINC so the relationship is explicit and cannot drift out of sync. 
Fixes: fbd47be098b5 ("amd-xgbe: add hardware PTP timestamping support") Tested-by: Gregory Fuchedgi Signed-off-by: Gregory Fuchedgi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260429-fix-xgbe-ptp-addend-v1-1-fca5b0ca5e62@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index 60b7e53206d1..3d3b09010d48 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -135,11 +135,11 @@ */ #define XGBE_TSTAMP_SSINC 20 #define XGBE_TSTAMP_SNSINC 0 -#define XGBE_PTP_ACT_CLK_FREQ 500000000 +#define XGBE_PTP_ACT_CLK_FREQ (NSEC_PER_SEC / XGBE_TSTAMP_SSINC) #define XGBE_V2_TSTAMP_SSINC 0xA #define XGBE_V2_TSTAMP_SNSINC 0 -#define XGBE_V2_PTP_ACT_CLK_FREQ 1000000000 +#define XGBE_V2_PTP_ACT_CLK_FREQ (NSEC_PER_SEC / XGBE_V2_TSTAMP_SSINC) /* Define maximum supported values */ #define XGBE_MAX_PPS_OUT 4 -- cgit v1.2.3 From 458d5615272d3de535748342eb68ca492343048c Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 30 Apr 2026 11:29:55 -0400 Subject: net/sched: sch_red: Replace direct dequeue call with peek and qdisc_dequeue_peeked When red qdisc has children (eg qfq qdisc) whose peek() callback is qdisc_peek_dequeued(), we could get a kernel panic. When the parent of such qdiscs (eg illustrated in patch #3 as tbf) wants to retrieve an skb from its child (red in this case), it will do the following: 1a. do a peek() - and when sensing there's an skb the child can offer, then - the child in this case(red) calls its child's (qfq) peek. qfq does the right thing and will return the gso_skb queue packet. Note: if there wasnt a gso_skb entry then qfq will store it there. 1b. invoke a dequeue() on the child (red). And herein lies the problem. - red will call the child's dequeue() which will essentially just try to grab something of qfq's queue. 
[ 78.667668][ T363] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 78.667927][ T363] CPU: 1 UID: 0 PID: 363 Comm: ping Not tainted 7.1.0-rc1-00033-g46f74a3f7d57-dirty #790 PREEMPT(full) [ 78.668263][ T363] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 78.668486][ T363] RIP: 0010:qfq_dequeue+0x446/0xc90 [sch_qfq] [ 78.668718][ T363] Code: 54 c0 e8 dd 90 00 f1 48 c7 c7 e0 03 54 c0 48 89 de e8 ce 90 00 f1 48 8d 7b 48 b8 ff ff 37 00 48 89 fa 48 c1 e0 2a 48 c1 ea 03 <80> 3c 02 00 74 05 e8 ef a1 e1 f1 48 8b 7b 48 48 8d 54 24 58 48 8d [ 78.669312][ T363] RSP: 0018:ffff88810de573e0 EFLAGS: 00010216 [ 78.669533][ T363] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 [ 78.669790][ T363] RDX: 0000000000000009 RSI: 0000000000000004 RDI: 0000000000000048 [ 78.670044][ T363] RBP: ffff888110dc4000 R08: ffffffffb1b0885a R09: fffffbfff6ba9078 [ 78.670297][ T363] R10: 0000000000000003 R11: ffff888110e31c80 R12: 0000001880000000 [ 78.670560][ T363] R13: ffff888110dc4150 R14: ffff888110dc42b8 R15: 0000000000000200 [ 78.670814][ T363] FS: 00007f66a8f09c40(0000) GS:ffff888163428000(0000) knlGS:0000000000000000 [ 78.671110][ T363] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 78.671324][ T363] CR2: 000055db4c6a30a8 CR3: 000000010da67000 CR4: 0000000000750ef0 [ 78.671585][ T363] PKRU: 55555554 [ 78.671713][ T363] Call Trace: [ 78.671843][ T363] [ 78.671936][ T363] ? __pfx_qfq_dequeue+0x10/0x10 [sch_qfq] [ 78.672148][ T363] ? __pfx__printk+0x10/0x10 [ 78.672322][ T363] ? srso_alias_return_thunk+0x5/0xfbef5 [ 78.672496][ T363] ? lockdep_hardirqs_on_prepare+0xa8/0x1a0 [ 78.672706][ T363] ? srso_alias_return_thunk+0x5/0xfbef5 [ 78.672875][ T363] ? trace_hardirqs_on+0x19/0x1a0 [ 78.673047][ T363] red_dequeue+0x65/0x270 [sch_red] [ 78.673217][ T363] ? 
srso_alias_return_thunk+0x5/0xfbef5 [ 78.673385][ T363] tbf_dequeue.cold+0xb0/0x70c [sch_tbf] [ 78.673566][ T363] __qdisc_run+0x169/0x1900 The right thing to do in #1b is to grab the skb off gso_skb queue. This patchset fixes that issue by changing #1b to use qdisc_dequeue_peeked() method instead. Fixes: 77be155cba4e ("pkt_sched: Add peek emulation for non-work-conserving qdiscs.") Reported-by: Manas Reported-by: Rakshit Awasthi Signed-off-by: Jamal Hadi Salim Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260430152957.194015-2-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_red.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 432b8a3000a5..4d0e44a2e7c6 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -162,7 +162,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch) struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; - skb = child->dequeue(child); + skb = qdisc_dequeue_peeked(child); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); -- cgit v1.2.3 From 1b9bc71153b01dbde8045b9edede4240f4f5520e Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 30 Apr 2026 11:29:56 -0400 Subject: net/sched: sch_sfb: Replace direct dequeue call with peek and qdisc_dequeue_peeked When sfb has children (eg qfq qdisc) whose peek() callback is qdisc_peek_dequeued(), we could get a kernel panic. When the parent of such qdiscs (eg illustrated in patch #3 as tbf) wants to retrieve an skb from its child (sfb in this case), it will do the following: 1a. do a peek() - and when sensing there's an skb the child can offer, then - the child in this case(sfb) calls its child's (qfq) peek. qfq does the right thing and will return the gso_skb queue packet. Note: if there wasn't a gso_skb entry then qfq will store it there. 1b. invoke a dequeue() on the child (sfb). And herein lies the problem. 
- sfb will call the child's dequeue() which will essentially just try to grab something of qfq's queue. [ 127.594489][ T453] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 127.594741][ T453] CPU: 2 UID: 0 PID: 453 Comm: ping Not tainted 7.1.0-rc1-00035-gac961974495b-dirty #793 PREEMPT(full) [ 127.595059][ T453] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 127.595254][ T453] RIP: 0010:qfq_dequeue+0x35c/0x1650 [sch_qfq] [ 127.595461][ T453] Code: 00 fc ff df 80 3c 02 00 0f 85 17 0e 00 00 4c 8d 73 48 48 89 9d b8 02 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 76 0c 00 00 48 b8 00 00 00 00 00 fc ff df 4c 8b [ 127.596081][ T453] RSP: 0018:ffff88810e5af440 EFLAGS: 00010216 [ 127.596337][ T453] RAX: dffffc0000000000 RBX: 0000000000000000 RCX: dffffc0000000000 [ 127.596623][ T453] RDX: 0000000000000009 RSI: 0000001880000000 RDI: ffff888104fd82b0 [ 127.596917][ T453] RBP: ffff888104fd8000 R08: ffff888104fd8280 R09: 1ffff110211893a3 [ 127.597165][ T453] R10: 1ffff110211893a6 R11: 1ffff110211893a7 R12: 0000001880000000 [ 127.597404][ T453] R13: ffff888104fd82b8 R14: 0000000000000048 R15: 0000000040000000 [ 127.597644][ T453] FS: 00007fc380cbfc40(0000) GS:ffff88816f2a8000(0000) knlGS:0000000000000000 [ 127.597956][ T453] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 127.598160][ T453] CR2: 00005610aa9890a8 CR3: 000000010369e000 CR4: 0000000000750ef0 [ 127.598390][ T453] PKRU: 55555554 [ 127.598509][ T453] Call Trace: [ 127.598629][ T453] [ 127.598718][ T453] ? mark_held_locks+0x40/0x70 [ 127.598890][ T453] ? srso_alias_return_thunk+0x5/0xfbef5 [ 127.599053][ T453] sfb_dequeue+0x88/0x4d0 [ 127.599174][ T453] ? ktime_get+0x137/0x230 [ 127.599328][ T453] ? srso_alias_return_thunk+0x5/0xfbef5 [ 127.599480][ T453] ? qdisc_peek_dequeued+0x7b/0x350 [sch_qfq] [ 127.599670][ T453] ? 
srso_alias_return_thunk+0x5/0xfbef5 [ 127.599831][ T453] tbf_dequeue+0x6b1/0x1098 [sch_tbf] [ 127.599988][ T453] __qdisc_run+0x169/0x1900 The right thing to do in #1b is to grab the skb off gso_skb queue. This patchset fixes that issue by changing #1b to use qdisc_dequeue_peeked() method instead. Fixes: e13e02a3c68d ("net_sched: SFB flow scheduler") Signed-off-by: Victor Nogueira Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260430152957.194015-3-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_sfb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index bd5ef561030f..d3ee8e5479b3 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -441,7 +441,7 @@ static struct sk_buff *sfb_dequeue(struct Qdisc *sch) struct Qdisc *child = q->qdisc; struct sk_buff *skb; - skb = child->dequeue(q->qdisc); + skb = qdisc_dequeue_peeked(child); if (skb) { qdisc_bstats_update(sch, skb); -- cgit v1.2.3 From 3a3a30c14d7f04206916824a79878cfefac6c8e2 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 30 Apr 2026 11:29:57 -0400 Subject: selftests/tc-testing: Add tests that force red and sfb to dequeue from child's gso_skb Create 4 test cases: - Force red to dequeue from its child's gso_skb with qfq leaf - Force sfb to dequeue from its child's gso_skb with qfq leaf - Force red to dequeue from its child's gso_skb with dualpi2 leaf - Force sfb to dequeue from its child's gso_skb with dualpi2 leaf All of them have tbf followed by red (or sfb) followed by qfq (or dualpi2). Since tbf calls its child's peek followed by qdisc_dequeue_peeked, it will force red/sfb to call their child's peek. In this case, since the child (qfq/dualpi2) has qdisc_peek_dequeued as its peek callback, the packet will be stored in its gso_skb queue. During the subsequent call to qdisc_dequeue_peeked, red/sfb will have to dequeue from the child's gso_skb to retrieve the packet. 
Not doing so will cause a NULL ptr deref which was happening before a recent fix. Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20260430152957.194015-4-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 148 +++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index eefadd0546d3..b1f856cf62c1 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -1136,5 +1136,153 @@ "teardown": [ "$TC qdisc del dev $DUMMY handle 1: root" ] + }, + { + "id": "7a5f", + "name": "Force red to dequeue from its child's gso_skb with qfq leaf", + "category": [ + "qdisc", + "tbf", + "red", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: red limit 757 min 16 max 24 avpkt 16", + "$TC qdisc add dev $DUMMY parent 2: handle 3: qfq", + "$TC class add dev $DUMMY classid 3:1 parent 3: qfq maxpkt 512 weight 1", + "$TC filter add dev $DUMMY parent 3: protocol ip prio 1 matchall classid 3:1 action ok" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "red", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "cdae", + "name": "Force sfb to dequeue from its child's gso_skb with qfq leaf", + "category": [ + "qdisc", + "tbf", + "sfb", + "qfq" + ], + "plugins": { + "requires": 
"nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: sfb", + "$TC qdisc add dev $DUMMY parent 2: handle 3: qfq", + "$TC class add dev $DUMMY classid 3:1 parent 3: qfq maxpkt 512 weight 1", + "$TC filter add dev $DUMMY parent 3: protocol ip prio 1 matchall classid 3:1 action ok" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "sfb", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "291d", + "name": "Force red to dequeue from its child's gso_skb with dualpi2 leaf", + "category": [ + "qdisc", + "tbf", + "red", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: red limit 757 min 16 max 24 avpkt 16", + "$TC qdisc add dev $DUMMY parent 2: handle 3: dualpi2" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "red", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] + }, + { + "id": "9c6d", + "name": "Force sfb to dequeue from its child's gso_skb with dualpi2 leaf", + "category": [ + "qdisc", + "tbf", + "sfb", + "dualpi2" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev 
$DUMMY up || true", + "$IP addr add 10.10.11.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY root handle 1: tbf rate 88bit burst 1661b peakrate 2257333 minburst 1024 limit 7b", + "$TC qdisc add dev $DUMMY parent 1: handle 2: sfb", + "$TC qdisc add dev $DUMMY parent 2: handle 3: dualpi2" + ], + "cmdUnderTest": "ping -c 1 10.10.10.1 -W0.01 -I$DUMMY || true", + "expExitCode": "0", + "verifyCmd": "$TC -s -j qdisc ls dev $DUMMY parent 1:", + "matchJSON": [ + { + "kind": "sfb", + "handle": "2:", + "bytes": 98, + "packets": 1, + "backlog": 0, + "qlen": 0 + } + ], + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1: root" + ] } ] -- cgit v1.2.3 From 1d324c2f43f70c965f25c58cc3611c779adbe47e Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Thu, 30 Apr 2026 18:33:18 +0800 Subject: ip6_gre: Use cached t->net in ip6erspan_changelink(). After commit 5e72ce3e3980 ("net: ipv6: Use link netns in newlink() of rtnl_link_ops"), ip6erspan_newlink() correctly resolves the per-netns ip6gre hash via link_net. ip6erspan_changelink() was not converted in that series and still uses dev_net(dev), which diverges from the device's creation netns after IFLA_NET_NS_FD migration. This re-inserts the tunnel into the wrong per-netns hash. The original netns keeps a stale entry. When that netns is later destroyed, ip6gre_exit_rtnl_net() walks the stale entry, producing a slab-use-after-free reported by KASAN, followed by a kernel BUG at net/core/dev.c (LIST_POISON1) in unregister_netdevice_many_notify(). Reachable from an unprivileged user namespace (unshare --user --map-root-user --net). ip6gre_changelink() earlier in the same file already uses the cached t->net; only ip6erspan_changelink() has the wrong shape. 
Fixes: 2d665034f239 ("net: ip6_gre: Fix ip6erspan hlen calculation") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Maoyi Xie Reviewed-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260430103318.3206018-1-maoyi.xie@ntu.edu.sg Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_gre.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 63fc8556b475..365b4059eb20 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2262,10 +2262,11 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { - struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); + struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; - struct ip6_tnl *t; + struct ip6gre_net *ign; + ign = net_generic(t->net, ip6gre_net_id); t = ip6gre_changelink_common(dev, tb, data, &p, extack); if (IS_ERR(t)) return PTR_ERR(t); -- cgit v1.2.3 From 046111a1a35a1720748f254377d3d1663664ea61 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 06:16:09 +0000 Subject: net/sched: sch_cake: annotate data-races in cake_dump_class_stats (I) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_class_stats() runs without qdisc spinlock being held. 
In this first patch, I add READ_ONCE()/WRITE_ONCE() annotations for: - flow->head - flow->dropped - b->backlogs[] Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: Toke Høiland-Jørgensen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260430061610.3503483-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 13c6d1869a14..806eb73d6a05 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -914,7 +914,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) struct sk_buff *skb = flow->head; if (skb) { - flow->head = skb->next; + WRITE_ONCE(flow->head, skb->next); skb_mark_not_on_list(skb); } @@ -926,7 +926,7 @@ static struct sk_buff *dequeue_head(struct cake_flow *flow) static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) { if (!flow->head) - flow->head = skb; + WRITE_ONCE(flow->head, skb); else flow->tail->next = skb; flow->tail = skb; @@ -1357,7 +1357,7 @@ found: if (elig_ack_prev) elig_ack_prev->next = elig_ack->next; else - flow->head = elig_ack->next; + WRITE_ONCE(flow->head, elig_ack->next); skb_mark_not_on_list(elig_ack); @@ -1595,11 +1595,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) len = qdisc_pkt_len(skb); q->buffer_used -= skb->truesize; - b->backlogs[idx] -= len; WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] - len); sch->qstats.backlog -= len; - flow->dropped++; + WRITE_ONCE(flow->dropped, flow->dropped + 1); WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); if (q->config->rate_flags & CAKE_FLAG_INGRESS) @@ -1824,11 +1824,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, } /* stats */ - b->backlogs[idx] += slen; sch->qstats.backlog += slen; q->avg_window_bytes += slen; 
WRITE_ONCE(b->bytes, b->bytes + slen); WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + slen); qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); consume_skb(skb); @@ -1861,11 +1861,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ WRITE_ONCE(b->packets, b->packets + 1); - b->backlogs[idx] += len - ack_pkt_len; sch->qstats.backlog += len - ack_pkt_len; q->avg_window_bytes += len - ack_pkt_len; WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len); WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len); + WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + len - ack_pkt_len); } if (q->overflow_timeout) @@ -1977,7 +1977,7 @@ static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) if (flow->head) { skb = dequeue_head(flow); len = qdisc_pkt_len(skb); - b->backlogs[q->cur_flow] -= len; + WRITE_ONCE(b->backlogs[q->cur_flow], b->backlogs[q->cur_flow] - len); WRITE_ONCE(b->tin_backlog, b->tin_backlog - len); sch->qstats.backlog -= len; q->buffer_used -= skb->truesize; @@ -2235,7 +2235,7 @@ retry: flow->deficit -= len; b->tin_deficit -= len; } - flow->dropped++; + WRITE_ONCE(flow->dropped, flow->dropped + 1); WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1); qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); @@ -3137,7 +3137,7 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, flow = &b->flows[idx % CAKE_QUEUES]; - if (flow->head) { + if (READ_ONCE(flow->head)) { sch_tree_lock(sch); skb = flow->head; while (skb) { @@ -3146,8 +3146,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, } sch_tree_unlock(sch); } - qs.backlog = b->backlogs[idx % CAKE_QUEUES]; - qs.drops = flow->dropped; + qs.backlog = READ_ONCE(b->backlogs[idx % CAKE_QUEUES]); + qs.drops = READ_ONCE(flow->dropped); } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; -- cgit v1.2.3 From 67dc6c56b871617deac85b9f72500b69b1fdf835 Mon Sep 17 
00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 06:16:10 +0000 Subject: net/sched: sch_cake: annotate data-races in cake_dump_class_stats (II) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cake_dump_class_stats() runs without qdisc spinlock being held. In this second patch, I add READ_ONCE()/WRITE_ONCE() annotations for: - flow->deficit - flow->cvars.dropping - flow->cvars.count - flow->cvars.p_drop - flow->cvars.blue_timer - flow->cvars.drop_next Fixes: 046f6fd5daef ("sched: Add Common Applications Kept Enhanced (cake) qdisc") Signed-off-by: Eric Dumazet Acked-by: Toke Høiland-Jørgensen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260430061610.3503483-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 131 ++++++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 60 deletions(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 806eb73d6a05..5862933be8d7 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -399,14 +399,14 @@ static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust); * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 */ -static void cobalt_newton_step(struct cobalt_vars *vars) +static void cobalt_newton_step(struct cobalt_vars *vars, u32 count) { u32 invsqrt, invsqrt2; u64 val; invsqrt = vars->rec_inv_sqrt; invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; - val = (3LL << 32) - ((u64)vars->count * invsqrt2); + val = (3LL << 32) - ((u64)count * invsqrt2); val >>= 2; /* avoid overflow in following multiply */ val = (val * invsqrt) >> (32 - 2 + 1); @@ -414,12 +414,12 @@ static void cobalt_newton_step(struct cobalt_vars *vars) vars->rec_inv_sqrt = val; } -static void cobalt_invsqrt(struct cobalt_vars *vars) +static void cobalt_invsqrt(struct cobalt_vars *vars, u32 count) { - if (vars->count < REC_INV_SQRT_CACHE) - vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; + if 
(count < REC_INV_SQRT_CACHE) + vars->rec_inv_sqrt = inv_sqrt_cache[count]; else - cobalt_newton_step(vars); + cobalt_newton_step(vars, count); } static void cobalt_vars_init(struct cobalt_vars *vars) @@ -449,16 +449,19 @@ static bool cobalt_queue_full(struct cobalt_vars *vars, bool up = false; if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { - up = !vars->p_drop; - vars->p_drop += p->p_inc; - if (vars->p_drop < p->p_inc) - vars->p_drop = ~0; - vars->blue_timer = now; - } - vars->dropping = true; - vars->drop_next = now; + u32 p_drop = vars->p_drop; + + up = !p_drop; + p_drop += p->p_inc; + if (p_drop < p->p_inc) + p_drop = ~0; + WRITE_ONCE(vars->p_drop, p_drop); + WRITE_ONCE(vars->blue_timer, now); + } + WRITE_ONCE(vars->dropping, true); + WRITE_ONCE(vars->drop_next, now); if (!vars->count) - vars->count = 1; + WRITE_ONCE(vars->count, 1); return up; } @@ -475,20 +478,20 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, if (vars->p_drop && ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { if (vars->p_drop < p->p_dec) - vars->p_drop = 0; + WRITE_ONCE(vars->p_drop, 0); else - vars->p_drop -= p->p_dec; - vars->blue_timer = now; + WRITE_ONCE(vars->p_drop, vars->p_drop - p->p_dec); + WRITE_ONCE(vars->blue_timer, now); down = !vars->p_drop; } - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->count, vars->count - 1); + cobalt_invsqrt(vars, vars->count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); } return down; @@ -507,6 +510,7 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, bool next_due, over_target; ktime_t schedule; u64 sojourn; + u32 count; /* The 'schedule' variable records, in its sign, whether 'now' is before or 
* after 'drop_next'. This allows 'drop_next' to be updated before the next @@ -528,21 +532,22 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, over_target = sojourn > p->target && sojourn > p->mtu_time * bulk_flows * 2 && sojourn > p->mtu_time * 4; - next_due = vars->count && ktime_to_ns(schedule) >= 0; + count = vars->count; + next_due = count && ktime_to_ns(schedule) >= 0; vars->ecn_marked = false; if (over_target) { if (!vars->dropping) { - vars->dropping = true; - vars->drop_next = cobalt_control(now, - p->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->dropping, true); + WRITE_ONCE(vars->drop_next, + cobalt_control(now, p->interval, + vars->rec_inv_sqrt)); } - if (!vars->count) - vars->count = 1; + if (!count) + count = 1; } else if (vars->dropping) { - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } if (next_due && vars->dropping) { @@ -550,23 +555,23 @@ static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars, if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) reason = QDISC_DROP_CONGESTED; - vars->count++; - if (!vars->count) - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + count++; + if (!count) + count--; + cobalt_invsqrt(vars, count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); schedule = ktime_sub(now, vars->drop_next); } else { while (next_due) { - vars->count--; - cobalt_invsqrt(vars); - vars->drop_next = cobalt_control(vars->drop_next, - p->interval, - vars->rec_inv_sqrt); + count--; + cobalt_invsqrt(vars, count); + WRITE_ONCE(vars->drop_next, + cobalt_control(vars->drop_next, p->interval, + vars->rec_inv_sqrt)); schedule = ktime_sub(now, vars->drop_next); - next_due = vars->count && ktime_to_ns(schedule) >= 0; + next_due = count && ktime_to_ns(schedule) >= 0; } } @@ -575,11 +580,12 @@ static enum qdisc_drop_reason cobalt_should_drop(struct 
cobalt_vars *vars, get_random_u32() < vars->p_drop) reason = QDISC_DROP_FLOOD_PROTECTION; + WRITE_ONCE(vars->count, count); /* Overload the drop_next field as an activity timeout */ - if (!vars->count) - vars->drop_next = ktime_add_ns(now, p->interval); + if (!count) + WRITE_ONCE(vars->drop_next, ktime_add_ns(now, p->interval)); else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC) - vars->drop_next = now; + WRITE_ONCE(vars->drop_next, now); return reason; } @@ -1924,7 +1930,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1); - flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); + WRITE_ONCE(flow->deficit, cake_get_flow_quantum(b, flow, q->config->flow_mode)); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. @@ -2166,7 +2172,8 @@ retry: } } - flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); + WRITE_ONCE(flow->deficit, + flow->deficit + cake_get_flow_quantum(b, flow, q->config->flow_mode)); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2232,7 +2239,7 @@ retry: if (q->config->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); - flow->deficit -= len; + WRITE_ONCE(flow->deficit, flow->deficit - len); b->tin_deficit -= len; } WRITE_ONCE(flow->dropped, flow->dropped + 1); @@ -2259,7 +2266,7 @@ retry: delay < b->base_delay ? 
2 : 8)); len = cake_advance_shaper(q, b, skb, now, false); - flow->deficit -= len; + WRITE_ONCE(flow->deficit, flow->deficit - len); b->tin_deficit -= len; if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { @@ -3153,6 +3160,8 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, return -1; if (flow) { ktime_t now = ktime_get(); + bool dropping; + u32 p_drop; stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); if (!stats) @@ -3167,21 +3176,23 @@ static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, goto nla_put_failure; \ } while (0) - PUT_STAT_S32(DEFICIT, flow->deficit); - PUT_STAT_U32(DROPPING, flow->cvars.dropping); - PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); - PUT_STAT_U32(P_DROP, flow->cvars.p_drop); - if (flow->cvars.p_drop) { + PUT_STAT_S32(DEFICIT, READ_ONCE(flow->deficit)); + dropping = READ_ONCE(flow->cvars.dropping); + PUT_STAT_U32(DROPPING, dropping); + PUT_STAT_U32(COBALT_COUNT, READ_ONCE(flow->cvars.count)); + p_drop = READ_ONCE(flow->cvars.p_drop); + PUT_STAT_U32(P_DROP, p_drop); + if (p_drop) { PUT_STAT_S32(BLUE_TIMER_US, ktime_to_us( ktime_sub(now, - flow->cvars.blue_timer))); + READ_ONCE(flow->cvars.blue_timer)))); } - if (flow->cvars.dropping) { + if (dropping) { PUT_STAT_S32(DROP_NEXT_US, ktime_to_us( ktime_sub(now, - flow->cvars.drop_next))); + READ_ONCE(flow->cvars.drop_next)))); } if (nla_nest_end(d->skb, stats) < 0) -- cgit v1.2.3 From 7e7be31bfdb066c1c780dcd6b1224078fc54063f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Apr 2026 15:29:38 -0700 Subject: net: tls: fix silent data drop under pipe back-pressure tls_sw_splice_read() uses len when advancing rxm->offset / rxm->full_len after skb_splice_bits(), rather than copied (the actual number of bytes successfully spliced into the pipe). When the destination pipe cannot accept all the requested bytes, splice_to_pipe() returns fewer bytes than len, and 'len - copied' of data is effectively skipped over. 
Fixes: e062fe99cccd ("tls: splice_read: fix accessing pre-processed records") Link: https://patch.msgid.link/20260429222944.2139041-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 798243eabb1f..2590e855f6a5 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2317,9 +2317,9 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, if (copied < 0) goto splice_requeue; - if (chunk < rxm->full_len) { - rxm->offset += len; - rxm->full_len -= len; + if (copied < rxm->full_len) { + rxm->offset += copied; + rxm->full_len -= copied; goto splice_requeue; } -- cgit v1.2.3 From bd3a4795d5744f59a1f485379f1303e5e606f377 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Apr 2026 15:29:39 -0700 Subject: selftests: tls: add test for data loss on small pipe Add selftest for data loss on short splice. Link: https://patch.msgid.link/20260429222944.2139041-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/tls.c | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index 9e2ccea13d70..30a236b8e9f7 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -946,6 +946,49 @@ TEST_F(tls, peek_and_splice) EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); } +TEST_F(tls, splice_to_pipe_small) +{ + int send_len = TLS_PAYLOAD_MAX_LEN; + char mem_send[TLS_PAYLOAD_MAX_LEN]; + char mem_recv[TLS_PAYLOAD_MAX_LEN]; + size_t total = 0; + int p[2]; + + memrnd(mem_send, sizeof(mem_send)); + + ASSERT_GE(pipe(p), 0); + + /* Shrink pipe to 1 page (typically 4096 bytes) to force multiple + * splice iterations for a 16384-byte TLS record. 
+ */ + EXPECT_GE(fcntl(p[1], F_SETPIPE_SZ, 4096), 4096); + + EXPECT_EQ(send(self->fd, mem_send, send_len, 0), send_len); + + while (total < (size_t)send_len) { + ssize_t spliced, drained; + + spliced = splice(self->cfd, NULL, p[1], NULL, + send_len - total, 0); + EXPECT_GT(spliced, 0); + if (spliced <= 0) + break; + + drained = read(p[0], mem_recv + total, spliced); + EXPECT_EQ(drained, spliced); + if (drained <= 0) + break; + + total += drained; + } + + EXPECT_EQ(total, (size_t)send_len); + EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0); + + close(p[0]); + close(p[1]); +} + #define MAX_FRAGS 48 TEST_F(tls, splice_short) { -- cgit v1.2.3 From 0cfff13c94cb5fa818bb374945ff280e08dc1bb9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 May 2026 08:54:27 +0200 Subject: wifi: mac80211: tests: mark HT check strict The HT check now only applies in strict mode since APs were found to be broken. Mark it as such. Fixes: 711a9c018ad2 ("wifi: mac80211: skip ieee80211_verify_sta_ht_mcs_support check in non-strict mode") Signed-off-by: Johannes Berg --- net/mac80211/tests/chan-mode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c index adc069065e73..fa370831d617 100644 --- a/net/mac80211/tests/chan-mode.c +++ b/net/mac80211/tests/chan-mode.c @@ -65,6 +65,7 @@ static const struct determine_chan_mode_case { .ht_capa_mask = { .mcs.rx_mask[0] = 0xf7, }, + .strict = true, }, { .desc = "Masking out a RX rate in VHT capabilities", .conn_mode = IEEE80211_CONN_MODE_EHT, -- cgit v1.2.3 From 65493f27a6008bf84bd11bd41c5e1ea6b0bf3c3d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 30 Apr 2026 10:44:15 -0700 Subject: wifi: cw1200: Revert "Fix locking in error paths" Revert commit d98c24617a83 ("wifi: cw1200: Fix locking in error paths") because it introduces a locking bug instead of fixing a locking bug. cw1200_wow_resume() unlocks priv->conf_mutex. 
Hence, adding mutex_unlock(&priv->conf_mutex) just after cw1200_wow_resume() is wrong. Reported-by: Ben Hutchings Closes: https://lore.kernel.org/all/408661f69f263266b028713e1412ba36d457e63d.camel@decadent.org.uk/ Fixes: d98c24617a83 ("wifi: cw1200: Fix locking in error paths") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20260430174418.1845431-1-bvanassche@acm.org Signed-off-by: Johannes Berg --- drivers/net/wireless/st/cw1200/pm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/st/cw1200/pm.c b/drivers/net/wireless/st/cw1200/pm.c index 84eb15d729c7..120f0379f81d 100644 --- a/drivers/net/wireless/st/cw1200/pm.c +++ b/drivers/net/wireless/st/cw1200/pm.c @@ -264,14 +264,12 @@ int cw1200_wow_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) wiphy_err(priv->hw->wiphy, "PM request failed: %d. WoW is disabled.\n", ret); cw1200_wow_resume(hw); - mutex_unlock(&priv->conf_mutex); return -EBUSY; } /* Force resume if event is coming from the device. */ if (atomic_read(&priv->bh_rx)) { cw1200_wow_resume(hw); - mutex_unlock(&priv->conf_mutex); return -EAGAIN; } -- cgit v1.2.3 From 05c5078de822148e7cb84968a8783ddfcb6c9ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Wed, 22 Apr 2026 18:32:58 +0200 Subject: wifi: ath12k: fix leak in some ath12k_wmi_xxx() functions Some wmi functions were using plain 'return ath12k_wmi_cmd_send(...)' without explicitly handling the error code. This leads to leaking the skb in case of error. 
Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.3.1-00218-QCAHKSWPL_SILICONZ-1 Fixes: 66a9448b1b89 ("wifi: ath12k: implement hardware data filter") Fixes: 593174170919 ("wifi: ath12k: implement WoW enable and wakeup commands") Fixes: 4a3c212eee0e ("wifi: ath12k: add basic WoW functionalities") Fixes: 16f474d6d49d ("wifi: ath12k: add WoW net-detect functionality") Fixes: 1666108c74c4 ("wifi: ath12k: support ARP and NS offload") Fixes: aab4ae566fa1 ("wifi: ath12k: support GTK rekey offload") Fixes: 7af01e569529 ("wifi: ath12k: handle keepalive during WoWLAN suspend and resume") Signed-off-by: Nicolas Escande Reviewed-by: Baochen Qiang Reviewed-by: Vasanthakumar Thiagarajan Link: https://patch.msgid.link/20260422163258.3013872-1-nico.escande@gmail.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 103 +++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 65a05a9520ff..75c87edd2a8a 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -10251,7 +10251,7 @@ int ath12k_wmi_hw_data_filter_cmd(struct ath12k *ar, struct wmi_hw_data_filter_a { struct wmi_hw_data_filter_cmd *cmd; struct sk_buff *skb; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10275,7 +10275,13 @@ int ath12k_wmi_hw_data_filter_cmd(struct ath12k *ar, struct wmi_hw_data_filter_a "wmi hw data filter enable %d filter_bitmap 0x%x\n", arg->enable, arg->hw_filter_bitmap); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_HW_DATA_FILTER_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_HW_DATA_FILTER_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_HW_DATA_FILTER_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_host_wakeup_ind(struct ath12k *ar) @@ -10283,6 +10289,7 @@ int ath12k_wmi_wow_host_wakeup_ind(struct ath12k *ar) struct 
wmi_wow_host_wakeup_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10295,14 +10302,20 @@ int ath12k_wmi_wow_host_wakeup_ind(struct ath12k *ar) ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow host wakeup ind\n"); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_HOSTWAKEUP_FROM_SLEEP_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_HOSTWAKEUP_FROM_SLEEP_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_HOSTWAKEUP_FROM_SLEEP_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_enable(struct ath12k *ar) { struct wmi_wow_enable_cmd *cmd; struct sk_buff *skb; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10317,7 +10330,13 @@ int ath12k_wmi_wow_enable(struct ath12k *ar) cmd->pause_iface_config = cpu_to_le32(WOW_IFACE_PAUSE_ENABLED); ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow enable\n"); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_ENABLE_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_add_wakeup_event(struct ath12k *ar, u32 vdev_id, @@ -10327,6 +10346,7 @@ int ath12k_wmi_wow_add_wakeup_event(struct ath12k *ar, u32 vdev_id, struct wmi_wow_add_del_event_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10343,7 +10363,13 @@ int ath12k_wmi_wow_add_wakeup_event(struct ath12k *ar, u32 vdev_id, ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow add wakeup event %s enable %d vdev_id %d\n", wow_wakeup_event(event), enable, vdev_id); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_DISABLE_WAKE_EVENT_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ENABLE_DISABLE_WAKE_EVENT_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send 
WMI_WOW_ENABLE_DISABLE_WAKE_EVENT_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_add_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id, @@ -10356,6 +10382,7 @@ int ath12k_wmi_wow_add_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id, struct sk_buff *skb; void *ptr; size_t len; + int ret; len = sizeof(*cmd) + sizeof(*tlv) + /* array struct */ @@ -10435,7 +10462,13 @@ int ath12k_wmi_wow_add_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id, ath12k_dbg_dump(ar->ab, ATH12K_DBG_WMI, NULL, "wow bitmask: ", bitmap->bitmaskbuf, pattern_len); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ADD_WAKE_PATTERN_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_ADD_WAKE_PATTERN_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_ADD_WAKE_PATTERN_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_wow_del_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id) @@ -10443,6 +10476,7 @@ int ath12k_wmi_wow_del_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id) struct wmi_wow_del_pattern_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10459,7 +10493,13 @@ int ath12k_wmi_wow_del_pattern(struct ath12k *ar, u32 vdev_id, u32 pattern_id) ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "wmi tlv wow del pattern vdev_id %d pattern_id %d\n", vdev_id, pattern_id); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_DEL_WAKE_PATTERN_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_WOW_DEL_WAKE_PATTERN_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_WOW_DEL_WAKE_PATTERN_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } static struct sk_buff * @@ -10595,6 +10635,7 @@ int ath12k_wmi_wow_config_pno(struct ath12k *ar, u32 vdev_id, struct wmi_pno_scan_req_arg *pno_scan) { struct sk_buff *skb; + int ret; if (pno_scan->enable) skb = ath12k_wmi_op_gen_config_pno_start(ar, vdev_id, pno_scan); @@ -10604,7 +10645,13 @@ int 
ath12k_wmi_wow_config_pno(struct ath12k *ar, u32 vdev_id, if (IS_ERR_OR_NULL(skb)) return -ENOMEM; - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_NETWORK_LIST_OFFLOAD_CONFIG_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_NETWORK_LIST_OFFLOAD_CONFIG_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_NETWORK_LIST_OFFLOAD_CONFIG_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } static void ath12k_wmi_fill_ns_offload(struct ath12k *ar, @@ -10717,6 +10764,7 @@ int ath12k_wmi_arp_ns_offload(struct ath12k *ar, void *buf_ptr; size_t len; u8 ns_cnt, ns_ext_tuples = 0; + int ret; ns_cnt = offload->ipv6_count; @@ -10752,7 +10800,13 @@ int ath12k_wmi_arp_ns_offload(struct ath12k *ar, if (ns_ext_tuples) ath12k_wmi_fill_ns_offload(ar, offload, &buf_ptr, enable, 1); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_SET_ARP_NS_OFFLOAD_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_SET_ARP_NS_OFFLOAD_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_SET_ARP_NS_OFFLOAD_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_gtk_rekey_offload(struct ath12k *ar, @@ -10762,7 +10816,7 @@ int ath12k_wmi_gtk_rekey_offload(struct ath12k *ar, struct wmi_gtk_rekey_offload_cmd *cmd; struct sk_buff *skb; __le64 replay_ctr; - int len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10789,7 +10843,13 @@ int ath12k_wmi_gtk_rekey_offload(struct ath12k *ar, ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "offload gtk rekey vdev: %d %d\n", arvif->vdev_id, enable); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_GTK_OFFLOAD_CMDID offload\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_gtk_rekey_getinfo(struct ath12k *ar, @@ -10797,7 +10857,7 @@ int ath12k_wmi_gtk_rekey_getinfo(struct ath12k *ar, { struct wmi_gtk_rekey_offload_cmd *cmd; struct sk_buff *skb; - int 
len; + int ret, len; len = sizeof(*cmd); skb = ath12k_wmi_alloc_skb(ar->wmi->wmi_ab, len); @@ -10811,7 +10871,13 @@ int ath12k_wmi_gtk_rekey_getinfo(struct ath12k *ar, ath12k_dbg(ar->ab, ATH12K_DBG_WMI, "get gtk rekey vdev_id: %d\n", arvif->vdev_id); - return ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + ret = ath12k_wmi_cmd_send(ar->wmi, skb, WMI_GTK_OFFLOAD_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_GTK_OFFLOAD_CMDID getinfo\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_sta_keepalive(struct ath12k *ar, @@ -10822,6 +10888,7 @@ int ath12k_wmi_sta_keepalive(struct ath12k *ar, struct wmi_sta_keepalive_cmd *cmd; struct sk_buff *skb; size_t len; + int ret; len = sizeof(*cmd) + sizeof(*arp); skb = ath12k_wmi_alloc_skb(wmi->wmi_ab, len); @@ -10849,7 +10916,13 @@ int ath12k_wmi_sta_keepalive(struct ath12k *ar, "wmi sta keepalive vdev %d enabled %d method %d interval %d\n", arg->vdev_id, arg->enabled, arg->method, arg->interval); - return ath12k_wmi_cmd_send(wmi, skb, WMI_STA_KEEPALIVE_CMDID); + ret = ath12k_wmi_cmd_send(wmi, skb, WMI_STA_KEEPALIVE_CMDID); + if (ret) { + ath12k_warn(ar->ab, "failed to send WMI_STA_KEEPALIVE_CMDID\n"); + dev_kfree_skb(skb); + } + + return ret; } int ath12k_wmi_mlo_setup(struct ath12k *ar, struct wmi_mlo_setup_arg *mlo_params) -- cgit v1.2.3 From 81594a12d5cecb3ab35b603a00037c7c3ee87ab2 Mon Sep 17 00:00:00 2001 From: Rameshkumar Sundaram Date: Mon, 27 Apr 2026 16:00:11 +0530 Subject: wifi: ath12k: initialize RSSI dBm conversion event state Currently, the RSSI dBm conversion event handler leaves struct ath12k_wmi_rssi_dbm_conv_info_arg uninitialized on the stack before calling the TLV parser. If one of the optional sub-TLVs is absent, the corresponding *_present flag retains stack garbage and later gets read in ath12k_wmi_update_rssi_offsets(). 
With UBSAN enabled this triggers an invalid-load report for _Bool: UBSAN: invalid-load in drivers/net/wireless/ath/ath12k/wmi.c:9682:15 load of value 9 is not a valid value for type '_Bool' Call Trace: ath12k_wmi_rssi_dbm_conversion_params_info_event.cold+0x72/0x85 [ath12k] ath12k_wmi_op_rx+0x1871/0x2ab0 [ath12k] ath12k_htc_rx_completion_handler+0x44b/0x810 [ath12k] ath12k_ce_recv_process_cb+0x554/0x9f0 [ath12k] ath12k_ce_per_engine_service+0xbe/0xf0 [ath12k] ath12k_pci_ce_workqueue+0x69/0x120 [ath12k] Initialize the parsed event state to zero before passing it to the TLV parser so missing sub-TLVs correctly leave the presence flags false. Tested-on: QCN9274 hw2.0 PCI WLAN.WBE.1.4.1-00199-QCAHKSWPL_SILICONZ-1 Fixes: 0314ee81a91d ("wifi: ath12k: handle WMI event for real noise floor calculation") Signed-off-by: Rameshkumar Sundaram Reviewed-by: Baochen Qiang Link: https://patch.msgid.link/20260427103011.2983269-1-rameshkumar.sundaram@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/wmi.c b/drivers/net/wireless/ath/ath12k/wmi.c index 75c87edd2a8a..b5e904a55aea 100644 --- a/drivers/net/wireless/ath/ath12k/wmi.c +++ b/drivers/net/wireless/ath/ath12k/wmi.c @@ -9778,7 +9778,7 @@ static void ath12k_wmi_rssi_dbm_conversion_params_info_event(struct ath12k_base *ab, struct sk_buff *skb) { - struct ath12k_wmi_rssi_dbm_conv_info_arg rssi_info; + struct ath12k_wmi_rssi_dbm_conv_info_arg rssi_info = {}; struct ath12k *ar; s32 noise_floor; u32 pdev_id; -- cgit v1.2.3 From 0e1308803d2c3fd365a6d21e6be355ec1e28eaaf Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Mon, 27 Apr 2026 13:51:41 +0800 Subject: wifi: ath12k: fix peer_id usage in normal RX path ath12k_dp_rx_deliver_msdu() currently uses hal_rx_desc_data::peer_id parsed from mpdu_start descriptor to do peer lookup. 
However, in an A-MSDU aggregation scenario, hardware only populates mpdu_start descriptor for the first sub-msdu, but not the following ones. In that case peer_id could be invalid, leading to peer lookup failure: ath12k_wifi7_pci 0000:06:00.0: rx skb 00000000c391c041 len 1532 peer (null) 0 ucast sn 0 eht320 rate_idx 12 vht_nss 2 freq 6105 band 3 flag 0x40d1a fcs-err 0 mic-err 0 amsdu-more 0 As a result, pubsta is NULL and parts of ieee80211_rx_status structure are left uninitialized, which may cause unexpected behavior. Fix it by switching the normal RX path to use ath12k_skb_rxcb::peer_id which is parsed from REO ring's rx_mpdu_desc and is always valid. hal_rx_desc_data::peer_id is still used in ath12k_wifi7_dp_rx_frag_h_mpdu(), which is safe since A-MSDU aggregation does not occur for fragmented frames. Similarly, ath12k_skb_rxcb::peer_id may be overwritten by hal_rx_desc_data::peer_id in ath12k_wifi7_dp_rx_h_mpdu(), which only handles non-aggregated multicast/broadcast traffic. Tested-on: WCN7850 hw2.0 PCI WLAN.HMT.1.1.c5-00302-QCAHMTSWPL_V1.0_V2.0_SILICONZ-1.115823.3 Fixes: 11157e0910fd ("wifi: ath12k: Use ath12k_dp_peer in per packet Tx & Rx paths") Signed-off-by: Baochen Qiang Reviewed-by: Rameshkumar Sundaram Link: https://patch.msgid.link/20260427-ath12k-fix-peer-id-source-v1-1-b5f701fb8e88@oss.qualcomm.com Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath12k/dp_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath12k/dp_rx.c b/drivers/net/wireless/ath/ath12k/dp_rx.c index 25557dea5826..b108ccd0f637 100644 --- a/drivers/net/wireless/ath/ath12k/dp_rx.c +++ b/drivers/net/wireless/ath/ath12k/dp_rx.c @@ -1340,7 +1340,7 @@ void ath12k_dp_rx_deliver_msdu(struct ath12k_pdev_dp *dp_pdev, struct napi_struc bool is_mcbc = rxcb->is_mcbc; bool is_eapol = rxcb->is_eapol; - peer = ath12k_dp_peer_find_by_peerid(dp_pdev, rx_info->peer_id); + peer = ath12k_dp_peer_find_by_peerid(dp_pdev, rxcb->peer_id); pubsta = peer ?
peer->sta : NULL; -- cgit v1.2.3 From d748603f12baff112caa3ab7d39f50100f010dbd Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 9 Dec 2025 11:04:59 +0100 Subject: wifi: ath5k: do not access array OOB Vincent reports: > The ath5k driver seems to do an array-index-out-of-bounds access as > shown by the UBSAN kernel message: > UBSAN: array-index-out-of-bounds in drivers/net/wireless/ath/ath5k/base.c:1741:20 > index 4 is out of range for type 'ieee80211_tx_rate [4]' > ... > Call Trace: > > dump_stack_lvl+0x5d/0x80 > ubsan_epilogue+0x5/0x2b > __ubsan_handle_out_of_bounds.cold+0x46/0x4b > ath5k_tasklet_tx+0x4e0/0x560 [ath5k] > tasklet_action_common+0xb5/0x1c0 It is real. 'ts->ts_final_idx' can be 3 on 5212, so: info->status.rates[ts->ts_final_idx + 1].idx = -1; with the array defined as: struct ieee80211_tx_rate rates[IEEE80211_TX_MAX_RATES]; while the size is: #define IEEE80211_TX_MAX_RATES 4 is indeed bogus. Set this 'idx = -1' sentinel only if the array index is less than the array size, as mac80211 will not look at rates beyond the size (IEEE80211_TX_MAX_RATES). Note: The effect of the OOB write is negligible. It just overwrites the next member of info->status, i.e. ack_signal.
Signed-off-by: Jiri Slaby (SUSE) Reported-by: Vincent Danjean Link: https://lore.kernel.org/all/aQYUkIaT87ccDCin@eldamar.lan Closes: https://bugs.debian.org/1119093 Fixes: 6d7b97b23e11 ("ath5k: fix tx status reporting issues") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20251209100459.2253198-1-jirislaby@kernel.org Signed-off-by: Jeff Johnson --- drivers/net/wireless/ath/ath5k/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c index 05c9c07591fc..6ca31d4ea437 100644 --- a/drivers/net/wireless/ath/ath5k/base.c +++ b/drivers/net/wireless/ath/ath5k/base.c @@ -1738,7 +1738,8 @@ ath5k_tx_frame_completed(struct ath5k_hw *ah, struct sk_buff *skb, } info->status.rates[ts->ts_final_idx].count = ts->ts_final_retry; - info->status.rates[ts->ts_final_idx + 1].idx = -1; + if (ts->ts_final_idx + 1 < IEEE80211_TX_MAX_RATES) + info->status.rates[ts->ts_final_idx + 1].idx = -1; if (unlikely(ts->ts_status)) { ah->stats.ack_fail++; -- cgit v1.2.3 From a200cdbf95932631ec338d08a6e9e31b34c4e8a6 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Mon, 27 Apr 2026 12:00:11 +0800 Subject: ovpn: reset MAC header before passing skb up After decapsulating a packet, the skb->mac_header still points to the outer transport header. Fix this by calling skb_reset_mac_header() in ovpn_netdev_write() to ensure the MAC header points to the beginning of the inner IP/network packet, as expected by the rest of the stack. 
Reported-by: Minqiang Chen Fixes: 8534731dbf2d ("ovpn: implement packet processing") Signed-off-by: Qingfang Deng Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index db43a1f8a07a..d92bb87be2b2 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -85,6 +85,7 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) skb_scrub_packet(skb, true); /* network header reset in ovpn_decrypt_post() */ + skb_reset_mac_header(skb); skb_reset_transport_header(skb); skb_reset_inner_headers(skb); -- cgit v1.2.3 From c539cb30f93f119566f2ae9d016cce11f188d780 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Wed, 25 Mar 2026 17:49:18 +0100 Subject: ovpn: ensure packet delivery happens with BH disabled ovpn injects decrypted packets into the netdev RX path through ovpn_netdev_write() which invokes gro_cells_receive() and dev_dstats_rx_add(). ovpn_netdev_write() is normally called in softirq context, however, in case of TCP connections it may also be invoked process context. When this happens gro_cells_receive() will throw a warning: [ 230.183747][ T12] WARNING: net/core/gro_cells.c:30 at gro_cells_receive+0x708/0xaa0, CPU#1: kworker/u16:0/12 and lockdep will also report a potential inconsistent lock state: WARNING: inconsistent lock state 7.0.0-rc4+ #246 Tainted: G W -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. because attempts to acquire gro_cells->bh_lock by both contexts may lead to a deadlock. At the same time, dev_dstats_rx_add() does not expect to race with a softirq (which may happen when invoked in process context), because the latter may access its per-cpu state and corrupt it. Fix all this by invoking local_bh_disable/enable() around gro_cells_receive() and dev_dstats_rx_add() to ensure that bottom halves are always disabled before calling both of them. 
Fixes: 11851cbd60ea ("ovpn: implement TCP transport") Signed-off-by: Ralf Lici Signed-off-by: Antonio Quartulli --- drivers/net/ovpn/io.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index d92bb87be2b2..22c555dd962e 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -91,12 +91,18 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) /* cause packet to be "received" by the interface */ pkt_len = skb->len; + /* we may get here in process context in case of TCP connections, + * therefore we have to disable BHs to ensure gro_cells_receive() + * and dev_dstats_rx_add() do not get corrupted or enter deadlock + */ + local_bh_disable(); ret = gro_cells_receive(&peer->ovpn->gro_cells, skb); if (likely(ret == NET_RX_SUCCESS)) { /* update RX stats with the size of decrypted packet */ ovpn_peer_stats_increment_rx(&peer->vpn_stats, pkt_len); dev_dstats_rx_add(peer->ovpn->dev, pkt_len); } + local_bh_enable(); } void ovpn_decrypt_post(void *data, int ret) -- cgit v1.2.3 From 201ba706318d460a2ea660e3652610be62532a70 Mon Sep 17 00:00:00 2001 From: Ralf Lici Date: Wed, 29 Apr 2026 10:00:16 +0200 Subject: selftests: ovpn: reduce ping count in test.sh The second stage of test.sh ("run baseline data traffic") performs a basic connectivity check with ping -qfc 500 -w 3. On slower CI instances this is too strict for TCP: the RTT is high enough that 500 echo requests do not reliably complete within 3 seconds, so the stage flakes and the test fails even though the ovpn setup is healthy. Reduce the packet count to 100 for both the plain and 3000-byte pings in that stage. This still verifies peer setup, key exchange, routing, and data-path traffic, without making the basic connectivity check depend on timing out under load. 
Fixes: 959bc330a439 ("testing/selftests: add test tool and scripts for ovpn module") Signed-off-by: Ralf Lici Signed-off-by: Antonio Quartulli --- tools/testing/selftests/net/ovpn/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh index b50dbe45a4d0..c06e3135fbef 100755 --- a/tools/testing/selftests/net/ovpn/test.sh +++ b/tools/testing/selftests/net/ovpn/test.sh @@ -98,10 +98,10 @@ ovpn_run_basic_traffic() { sleep 0.3 ovpn_cmd_ok "send baseline traffic to peer ${p}" \ ip netns exec ovpn_peer0 \ - ping -qfc 500 -w 3 5.5.5.$((p + 1)) + ping -qfc 100 -w 3 5.5.5.$((p + 1)) ovpn_cmd_ok "send large-payload traffic to peer ${p}" \ ip netns exec ovpn_peer0 \ - ping -qfc 500 -s 3000 -w 3 5.5.5.$((p + 1)) + ping -qfc 100 -s 3000 -w 3 5.5.5.$((p + 1)) wait "${tcpdump_pid1}" || return 1 wait "${tcpdump_pid2}" || return 1 -- cgit v1.2.3 From afbd961305eb483515650ccfcb7743608e7add78 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:13 +0300 Subject: ipvs: fixes for the new ip_vs_status info Sashiko reports some problems for the recently added /proc/net/ip_vs_status: * ip_vs_status_show() as a table reader may run long after the conn_tab and svc_table table are released. While ip_vs_conn_flush() properly changes the conn_tab_changes counter when conn_tab is removed, ip_vs_del_service() and ip_vs_flush() were missing such change for the svc_table_changes counter. As result, readers like ip_vs_dst_event() and ip_vs_status_show() may continue to use a freed table after a cond_resched_rcu() call. * While counting the buckets in ip_vs_status_show() make sure we traverse only the needed number of entries in the chain. This also prevents possible overflow of the 'count' variable. * Add check for 'loops' to prevent infinite loops while restarting the traversal on table change. 
* While IP_VS_CONN_TAB_MAX_BITS is 20 on 32-bit platforms and there is no risk to overflow when multiplying the number of conn_tab buckets to 100, prefer the div_u64() helper to make the following dividing safer. * Use 0440 permissions for ip_vs_status to restrict the info only to root due to the exported information for hash distribution. Link: https://sashiko.dev/#/patchset/20260410112352.23599-1-fw%40strlen.de Fixes: 9a9ccef907a7 ("ipvs: add ip_vs_status info") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 51 +++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6632daa87ded..27e50afe9a54 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2032,6 +2032,9 @@ static int ip_vs_del_service(struct ip_vs_service *svc) cancel_delayed_work_sync(&ipvs->svc_resize_work); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -2078,6 +2081,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -3004,7 +3010,8 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) int old_gen, new_gen; u32 counts[8]; u32 bucket; - int count; + u32 count; + int loops; u32 sum1; u32 sum; int i; @@ -3020,6 +3027,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) if (!atomic_read(&ipvs->conn_count)) goto after_conns; old_gen = 
atomic_read(&ipvs->conn_tab_changes); + loops = 0; repeat_conn: smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ @@ -3032,8 +3040,11 @@ repeat_conn: resched_score++; ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; - hlist_bl_for_each_entry_rcu(hn, e, head, node) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3042,37 +3053,41 @@ repeat_conn: new_gen = atomic_read(&ipvs->conn_tab_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_conns; old_gen = new_gen; goto repeat_conn; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_conns: t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); - seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Services:\t%u\n", count); seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", t ? t->size : 0, t ? t->bits : 0, t ? 
t->lfactor : 0); if (!count) goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); + loops = 0; repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ @@ -3086,8 +3101,11 @@ repeat_svc: ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; hlist_bl_for_each_entry_rcu(svc, e, head, - s_list) + s_list) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3096,24 +3114,27 @@ repeat_svc: new_gen = atomic_read(&ipvs->svc_table_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_svc; old_gen = new_gen; goto repeat_svc; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_svc: @@ -5039,7 +5060,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; - if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net, ip_vs_status_show, NULL)) goto err_status; #endif -- cgit v1.2.3 From f2da9a96abb4b7a64626e931cedd85f05d5498ca Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:14 +0300 Subject: ipvs: fix races around the conn_lfactor and svc_lfactor sysctl vars Sashiko warns that 
the new sysctls vars can be changed after the hash tables are destroyed and their respective resizing works canceled, leading to mod_delayed_work() being called for canceled works. Solve this in different ways. conn_tab can be present even without services and is destroyed only on netns exit, so use disable_delayed_work_sync() to disable the work instead of adding more synchronization mechanisms. As for the svc_table, it is destroyed when the services are deleted, so we must be sure that netns exit is not called yet (the check for 'enable') and the work is not canceled by checking all under same mutex lock. Also, use WRITE_ONCE when updating the sysctl vars as we already read them with READ_ONCE. Link: https://sashiko.dev/#/patchset/20260410112352.23599-1-fw%40strlen.de Fixes: 8d7de5477e47 ("ipvs: add conn_lfactor and svc_lfactor sysctl vars") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 2082bfb2d93c..84a4921a7865 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1835,7 +1835,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) if (!rcu_dereference_protected(ipvs->conn_tab, 1)) return; - cancel_delayed_work_sync(&ipvs->conn_resize_work); + disable_delayed_work_sync(&ipvs->conn_resize_work); if (!atomic_read(&ipvs->conn_count)) goto unreg; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 27e50afe9a54..caec516856e9 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2469,7 +2469,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; + WRITE_ONCE(*valp, val); if (rcu_access_pointer(ipvs->conn_tab)) 
mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); @@ -2496,10 +2496,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; - if (rcu_access_pointer(ipvs->svc_table)) + mutex_lock(&ipvs->service_mutex); + WRITE_ONCE(*valp, val); + /* Make sure the services are present */ + if (rcu_access_pointer(ipvs->svc_table) && + READ_ONCE(ipvs->enable) && + !test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) mod_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 0); + mutex_unlock(&ipvs->service_mutex); } } return ret; -- cgit v1.2.3 From d493d9de1c21313cf62be0f6e1a4d48385fa7beb Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:15 +0300 Subject: ipvs: fix the spin_lock usage for RT build syzbot reports for sleeping function called from invalid context [1]. The recently added code for resizable hash tables uses hlist_bl bit locks in combination with spin_lock for the connection fields (cp->lock). Fix the following problems: * avoid using spin_lock(&cp->lock) under locked bit lock because it sleeps on PREEMPT_RT * as the recent changes call ip_vs_conn_hash() only for newly allocated connection, the spin_lock can be removed there because the connection is still not linked to table and does not need cp->lock protection. * the lock can be removed also from ip_vs_conn_unlink() where we are the last connection user. * the last place that is fixed is ip_vs_conn_fill_cport() where now the cp->lock is locked before the other locks to ensure other packets do not modify the cp->flags in non-atomic way. Here we make sure cport and flags are changed only once if two or more packets race to fill the cport. Also, we fill cport early, so that if we race with resizing there will be valid cport key for the hashing. 
Add a warning if too many hash table changes occur for our RCU read-side critical section which is error condition but minor because the connection still can expire gracefully. Still, restore the cport to 0 to allow retransmitted packet to properly fill the cport. Problems reported by Sashiko. [1]: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 16, name: ktimers/0 preempt_count: 2, expected: 0 RCU nest depth: 3, expected: 3 8 locks held by ktimers/0/16: #0: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #1: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: timer_base_lock_expiry kernel/time/timer.c:1502 [inline] #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: __run_timer_base+0x120/0x9f0 kernel/time/timer.c:2384 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __rt_spin_lock kernel/locking/spinlock_rt.c:50 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1e0/0x400 kernel/locking/spinlock_rt.c:57 #4: ffffc90000157a80 ((&cp->timer)){+...}-{0:0}, at: call_timer_fn+0xd4/0x5e0 kernel/time/timer.c:1745 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:315 [inline] #5: ffffffff8dfc80c0 
(rcu_read_lock){....}-{1:3}, at: ip_vs_conn_expire+0x257/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 #6: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 Preemption disabled at: [] bit_spin_lock include/linux/bit_spinlock.h:38 [inline] [] hlist_bl_lock+0x18/0x110 include/linux/list_bl.h:149 CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Tainted: G W L syzkaller #0 PREEMPT_{RT,(full)} Tainted: [W]=WARN, [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/18/2026 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 __might_resched+0x329/0x480 kernel/sched/core.c:9162 __rt_spin_lock kernel/locking/spinlock_rt.c:48 [inline] rt_spin_lock+0xc2/0x400 kernel/locking/spinlock_rt.c:57 spin_lock include/linux/spinlock_rt.h:45 [inline] ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 call_timer_fn+0x192/0x5e0 kernel/time/timer.c:1748 expire_timers kernel/time/timer.c:1799 [inline] __run_timers kernel/time/timer.c:2374 [inline] __run_timer_base+0x6a3/0x9f0 kernel/time/timer.c:2386 run_timer_base kernel/time/timer.c:2395 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2405 handle_softirqs+0x1de/0x6d0 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] run_ktimerd+0x69/0x100 kernel/softirq.c:1151 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Reported-by: 
syzbot+504e778ddaecd36fdd17@syzkaller.appspotmail.com Link: https://sashiko.dev/#/patchset/20260415200216.79699-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260420165539.85174-4-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422135823.50489-4-ja%40ssi.bg Fixes: 2fa7cc9c7025 ("ipvs: switch to per-net connection table") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 74 +++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 84a4921a7865..9ea6b4fa78bf 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hash_key2 = hash_key; use2 = false; } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, &head, &head2); - spin_lock(&cp->lock); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - cp->flags |= IP_VS_CONN_F_HASHED; - WRITE_ONCE(cp->hn0.hash_key, hash_key); - WRITE_ONCE(cp->hn1.hash_key, hash_key2); - refcount_inc(&cp->refcnt); - hlist_bl_add_head_rcu(&cp->hn0.node, head); - if (use2) - hlist_bl_add_head_rcu(&cp->hn1.node, head2); - ret = 1; - } else { - pr_err("%s(): request for already hashed, called from %pS\n", - __func__, __builtin_return_address(0)); - ret = 0; - } - spin_unlock(&cp->lock); + cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); + refcount_inc(&cp->refcnt); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); + conn_tab_unlock(head, head2); + ret = 1; /* Schedule resizing if load increases */ if (atomic_read(&ipvs->conn_count) > t->u_thresh && @@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, &head, &head2); - 
spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ @@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) } } - spin_unlock(&cp->lock); conn_tab_unlock(head, head2); rcu_read_unlock(); @@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) struct ip_vs_conn_hnode *hn; u32 hash_key, hash_key_new; struct ip_vs_conn_param p; + bool by_me = false; int ntbl; int dir; @@ -664,8 +656,16 @@ retry: t = rcu_dereference(t->new_tbl); ntbl++; /* We are lost? */ - if (ntbl >= 2) + if (ntbl >= 2) { + spin_lock_bh(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me) + cp->cport = 0; + /* hn1 will be rehashed on next packet */ + spin_unlock_bh(&cp->lock); + IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n", + __func__, dir); return; + } } /* Rehashing during resize? Use the recent table for adds */ @@ -683,10 +683,13 @@ retry: if (head > head2 && t == t2) swap(head, head2); + /* Protect the cp->flags modification */ + spin_lock_bh(&cp->lock); + /* Lock seqcount only for the old bucket, even if we are on new table * because it affects the del operation, not the adding. 
*/ - spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_lock(&t->lock[hash_key & t->lock_mask].l); preempt_disable_nested(); write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); @@ -704,14 +707,23 @@ retry: hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + spin_unlock_bh(&cp->lock); hash_key = hash_key_new; goto retry; } - spin_lock(&cp->lock); - if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && - (cp->flags & IP_VS_CONN_F_HASHED)) { + /* Fill cport once, even if multiple packets try to do it */ + if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) { + /* If we race with resizing make sure cport is set for dir 1 */ + if (!cp->cport) { + cp->cport = cport; + by_me = true; + } + if (!dir) { + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + } /* We do not recalc hash_key_r under lock, we assume the * parameters in cp do not change, i.e. cport is * the only possible change. @@ -726,21 +738,17 @@ retry: hlist_bl_del_rcu(&hn->node); hlist_bl_add_head_rcu(&hn->node, head_new); } - if (!dir) { - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } } - spin_unlock(&cp->lock); if (head != head2) hlist_bl_unlock(head2); hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); - if (dir--) + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + + spin_unlock_bh(&cp->lock); + if (dir-- && by_me) goto next_dir; } -- cgit v1.2.3 From fbe1e01e818ee6db86ff947599bf0bea96de7e71 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:16 +0300 Subject: ipvs: do not leak dest after get from dest trash Sashiko warns about leaked dest if ip_vs_start_estimator() fails in ip_vs_add_dest(). 
Add ip_vs_trash_put_dest() to put back the dest into dest trash. Link: https://sashiko.dev/#/patchset/20260428175725.72050-1-ja%40ssi.bg Fixes: 705dd3444081 ("ipvs: use kthreads for stats estimation") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index caec516856e9..d81077c2457a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1102,6 +1102,24 @@ out: return dest; } +/* Put destination in trash */ +static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest, unsigned long istart, + bool cleanup) +{ + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + refcount_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash with reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = istart; + spin_unlock_bh(&ipvs->dest_trash_lock); +} + static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; @@ -1461,9 +1479,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + /* On error put back dest into the trash */ if (ret < 0) - return ret; - __ip_vs_update_dest(svc, dest, udest, 1); + ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start, + false); + else + __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure @@ -1533,17 +1554,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, */ ip_vs_rs_unhash(dest); - 
spin_lock_bh(&ipvs->dest_trash_lock); - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - refcount_read(&dest->refcnt)); - if (list_empty(&ipvs->dest_trash) && !cleanup) - mod_timer(&ipvs->dest_trash_timer, - jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); - /* dest lives in trash with reference */ - list_add(&dest->t_list, &ipvs->dest_trash); - dest->idle_start = 0; - spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_trash_put_dest(ipvs, dest, 0, cleanup); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. -- cgit v1.2.3 From 2fd109238925d53c44ea409df0558844af7877b8 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:17 +0300 Subject: ipvs: fix races around est_mutex and est_cpulist Sashiko reports for races and possible crash around the usage of est_cpulist_valid and sysctl_est_cpulist. The problem is that we do not lock est_mutex in some places which can lead to wrong write ordering and as result problems when calling cpumask_weight() and cpumask_empty(). Fix them by moving the est_max_threads read/write under locked est_mutex. Do the same for one ip_vs_est_reload_start() call to protect the cpumask_empty() usage of sysctl_est_cpulist. To remove the chance of deadlock while stopping the estimation kthreads, keep the data structure for kthread 0 even after last estimator is removed and do not hold mutexes while stopping this task. Now we will use a new flag 'needed' to know when kthread 0 should run. The kthreads above 0 do not use mutexes, so stop them under est_mutex because their kthread data still can be destroyed if they do not serve estimators. Now all kthreads will be started by the est_reload_work to properly serialize the stop/start for kthread 0. Reduce the use of service_mutex in ip_vs_est_calc_phase() because under est_mutex we can safely walk est_kt_arr to stop the kthreads above slot 0. 
As ip_vs_stop_estimator() for tot_stats should be called under service_mutex, do it early in the netns exit path in ip_vs_flush() to avoid locking the mutex again later. It still should be called in ip_vs_control_net_cleanup_sysctl() when we are called during netns init error. Use -2 for ktid as indicator if estimator was already stopped. Finally, fix use-after-free for kd->est_row in ip_vs_est_calc_phase(). est->ktrow should simply switch to a delay value while estimator is linked to est_temp_list. Link: https://sashiko.dev/#/patchset/20260331165015.2777765-1-longman%40redhat.com Link: https://sashiko.dev/#/patchset/20260420171308.87192-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422125123.40658-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260424175858.54752-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260425103918.7447-1-ja%40ssi.bg Fixes: f0be83d54217 ("ipvs: add est_cpulist and est_nice sysctl vars") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 11 +++++- net/netfilter/ipvs/ip_vs_ctl.c | 51 +++++++++++++++++++++----- net/netfilter/ipvs/ip_vs_est.c | 83 ++++++++++++++++++++++++------------------ 3 files changed, 100 insertions(+), 45 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 72d325c81313..d28ad8a0541f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -491,6 +491,7 @@ struct ip_vs_est_kt_data { DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ + int needed; /* task is needed */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ @@ -1884,11 +1885,19 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct 
ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); -void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_stop_estimator_tot_stats(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + ipvs->tot_stats->s.est.ktid = -2; +#endif +} + static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d81077c2457a..5c9f8e0e238f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work) if (!kd) continue; /* New config ? Stop kthread tasks */ - if (genid != genid_done) - ip_vs_est_kthread_stop(kd); + if (genid != genid_done) { + if (!id) { + /* Only we can stop kt 0 but not under mutex */ + mutex_unlock(&ipvs->est_mutex); + ip_vs_est_kthread_stop(kd); + mutex_lock(&ipvs->est_mutex); + if (!READ_ONCE(ipvs->enable)) + goto unlock; + /* kd for kt 0 is never destroyed */ + } else { + ip_vs_est_kthread_stop(kd); + } + } if (!kd->task && !ip_vs_est_stopped(ipvs)) { + bool start; + /* Do not start kthreads above 0 in calc phase */ - if ((!id || !ipvs->est_calc_phase) && - ip_vs_est_kthread_start(ipvs, kd) < 0) + if (id) + start = !ipvs->est_calc_phase; + else + start = kd->needed; + if (start && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } @@ -1823,11 +1839,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { + mutex_lock(&ipvs->est_mutex); + /* Now there is a service - full throttle */ WRITE_ONCE(ipvs->enable, 1); + ipvs->est_max_threads = 
ip_vs_est_max_threads(ipvs); + /* Start estimation for first time */ - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); } return 0; @@ -2103,6 +2124,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = p; } } + /* Stop the tot_stats estimator early under service_mutex + * to avoid locking it again later. + */ + if (cleanup) + ip_vs_stop_estimator_tot_stats(ipvs); return 0; } @@ -2348,7 +2374,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); unlock: mutex_unlock(&ipvs->est_mutex); @@ -2428,7 +2454,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -2455,7 +2481,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -5005,7 +5031,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + if (ipvs->tot_stats->s.est.ktid != -2) { + /* Not stopped yet? This happens only on netns init error and + * we even do not need to lock the service_mutex for this case. 
+ */ + mutex_lock(&ipvs->service_mutex); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + mutex_unlock(&ipvs->service_mutex); + } if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 433ba3cab58c..ab09f5182951 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -68,6 +68,11 @@ and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data + - data protected by service_mutex: est_temp_list, est_add_ktid, + est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W) + - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist, + est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation, + est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0) */ static struct lock_class_key __ipvs_est_key; @@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data) } /* Schedule stop/start for kthread tasks */ -void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart) { + lockdep_assert_held(&ipvs->est_mutex); + /* Ignore reloads before first service is added */ if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); - /* Bump the kthread configuration genid */ - atomic_inc(&ipvs->est_genid); + /* Bump the kthread configuration genid if stopping is requested */ + if (restart) + atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } @@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) void *arr = NULL; int i; - if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - READ_ONCE(ipvs->enable) && ipvs->est_max_threads) - return -EINVAL; - mutex_lock(&ipvs->est_mutex); + /* Allow kt 0 data to be created before the services are added + * and limit the kthreads when 
services are present. + */ + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; @@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); + kd->needed = 1; /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { @@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) goto out; } - /* Start kthread tasks only when services are present */ - if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { - ret = ip_vs_est_kthread_start(ipvs, kd); - if (ret < 0) - goto out; - } + /* Request kthread to be started */ + ip_vs_est_reload_start(ipvs, false); if (arr) ipvs->est_kt_count++; @@ -482,12 +492,11 @@ out: /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ? + ipvs->est_kt_arr[0] : NULL; struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) - ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); - est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ @@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) * will not allocate much memory, just for kt 0. 
*/ ret = 0; - if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + if (!kd) { ret = ip_vs_est_add_kthread(ipvs); + } else if (!kd->needed) { + mutex_lock(&ipvs->est_mutex); + /* We have job for the kt 0 task */ + kd->needed = 1; + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); + } if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else @@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) } end_kt0: - /* kt 0 is freed after all other kthreads and chains are empty */ + /* kt 0 task is stopped after all other kt slots and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; - if (!kd || !kd->est_count) { + if (kd && !kd->est_count) { mutex_lock(&ipvs->est_mutex); - if (kd) { - ip_vs_est_kthread_destroy(kd); - ipvs->est_kt_arr[0] = NULL; - } - ipvs->est_kt_count--; + /* Keep the kt0 data but request kthread_stop */ + kd->needed = 0; + ip_vs_est_reload_start(ipvs, true); mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } @@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&ipvs->service_mutex); + mutex_lock(&ipvs->est_mutex); kd = ipvs->est_kt_arr[0]; - mutex_unlock(&ipvs->service_mutex); + mutex_unlock(&ipvs->est_mutex); s = kd ? 
kd->calc_stats : NULL; if (!s) goto out; @@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&ipvs->service_mutex); - /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!READ_ONCE(ipvs->enable)) - goto unlock2; + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) { + mutex_unlock(&ipvs->est_mutex); + return; + } kd = ipvs->est_kt_arr[id]; if (!kd) continue; @@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) } mutex_unlock(&ipvs->est_mutex); + mutex_lock(&ipvs->service_mutex); + /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while - * we reschedule. Even for kthread 0. + * we reschedule. */ step = 0; @@ -849,9 +865,7 @@ walk_chain: ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; - est->ktrow = row - kd->est_row; - if (est->ktrow < 0) - est->ktrow += IPVS_EST_NTICKS; + est->ktrow = delay; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? 
*/ if (last) @@ -889,7 +903,6 @@ end_dequeue: if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; -unlock2: mutex_unlock(&ipvs->est_mutex); unlock: -- cgit v1.2.3 From 4ee52b7021a7cb9356f8b9aff5631c68512a9e1b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:18 +0300 Subject: ipvs: fix shift-out-of-bounds in ip_vs_rht_desired_size Calling roundup_pow_of_two() with 0 has undefined result: UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'unsigned long' CPU: 1 UID: 0 PID: 77 Comm: kworker/u8:4 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Workqueue: events_unbound conn_resize_work_handler Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 ubsan_epilogue+0xa/0x30 lib/ubsan.c:233 __ubsan_handle_shift_out_of_bounds+0x385/0x410 lib/ubsan.c:494 __roundup_pow_of_two include/linux/log2.h:57 [inline] ip_vs_rht_desired_size+0x2cf/0x410 net/netfilter/ipvs/ip_vs_core.c:240 ip_vs_conn_desired_size net/netfilter/ipvs/ip_vs_conn.c:765 [inline] conn_resize_work_handler+0x1b6/0x14c0 net/netfilter/ipvs/ip_vs_conn.c:822 process_one_work kernel/workqueue.c:3302 [inline] process_scheduled_works+0xb5d/0x1860 kernel/workqueue.c:3385 worker_thread+0xa53/0xfc0 kernel/workqueue.c:3466 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Reported-by: syzbot+217f1db9c791e27fe54a@syzkaller.appspotmail.com Fixes: b655388111cf ("ipvs: add resizable hash tables") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f5b7a2047291..d40b404c1bf6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ 
-237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, { if (!t) return 1 << min_bits; - n = roundup_pow_of_two(n); + n = n > 0 ? roundup_pow_of_two(n) : 1; if (lfactor < 0) { int factor = min(-lfactor, max_bits); -- cgit v1.2.3 From aa6065206987278291c09d0c6aebed687114c925 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:19 +0300 Subject: ipvs: Guard access of HK_TYPE_KTHREAD cpumask with RCU The ip_vs_ctl.c file and the associated ip_vs.h file are the only places in the kernel where HK_TYPE_KTHREAD cpumask is being retrieved and used. Now that HK_TYPE_KTHREAD/HK_TYPE_DOMAIN cpumask can be changed at run time. We need to use RCU to guard access to this cpumask to avoid a potential UAF problem as the returned cpumask may be freed before it is being used. We can replace HK_TYPE_KTHREAD by HK_TYPE_DOMAIN as they are aliases of each other, but keeping the HK_TYPE_KTHREAD name can highlight the fact that it is the kthread initiated by ipvs that is being controlled. 
Fixes: 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 20 ++++++++++++++++---- net/netfilter/ipvs/ip_vs_ctl.c | 13 ++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d28ad8a0541f..02762ce73a0c 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1412,7 +1412,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; @@ -1530,7 +1530,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } @@ -1565,6 +1565,18 @@ static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) return READ_ONCE(ipvs->sysctl_svc_lfactor); } +static inline bool sysctl_est_cpulist_empty(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_empty(__sysctl_est_cpulist(ipvs)); +} + +static inline unsigned int sysctl_est_cpulist_weight(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_weight(__sysctl_est_cpulist(ipvs)); +} + /* IPVS core functions * (from ip_vs_core.c) */ @@ -1904,7 +1916,7 @@ static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs))); + sysctl_est_cpulist_empty(ipvs)); #endif } @@ -1920,7 +1932,7 @@ static inline bool ip_vs_est_stopped(struct 
netns_ipvs *ipvs) static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * - cpumask_weight(sysctl_est_cpulist(ipvs)); + sysctl_est_cpulist_weight(ipvs); return max(1U, limit); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5c9f8e0e238f..c7c7f6a7a9f6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2394,11 +2394,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, mutex_lock(&ipvs->est_mutex); - if (ipvs->est_cpulist_valid) - mask = *valp; - else - mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); - ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + /* HK_TYPE_KTHREAD cpumask needs RCU protection */ + scoped_guard(rcu) { + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + } mutex_unlock(&ipvs->est_mutex); -- cgit v1.2.3 From 8f78b749f3da0f43990490b4c1193b5ede3eec0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:20 +0300 Subject: sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management"), kthreads default to use the HK_TYPE_DOMAIN cpumask. IOW, it is no longer affected by the setting of the nohz_full boot kernel parameter. That means HK_TYPE_KTHREAD should now be an alias of HK_TYPE_DOMAIN instead of HK_TYPE_KERNEL_NOISE to correctly reflect the current kthread behavior. Make the change as HK_TYPE_KTHREAD is still being used in some networking code. 
Fixes: 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/linux/sched/isolation.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index dc3975ff1b2e..cf0fd03dd7a2 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -20,6 +20,11 @@ enum hk_type { HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, + /* + * HK_TYPE_KTHREAD is now an alias of HK_TYPE_DOMAIN + */ + HK_TYPE_KTHREAD = HK_TYPE_DOMAIN, + /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. @@ -29,7 +34,6 @@ enum hk_type { HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, - HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION -- cgit v1.2.3 From d82ba05263c69fa2437fe93e4e561cc40f4c03af Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 1 May 2026 07:39:41 +0000 Subject: af_unix: Set gc_in_progress to true in unix_gc(). Igor Ushakov reported that unix_gc() could run with gc_in_progress being false if the work is scheduled while running: Thread 1 Thread 2 Thread 3 -------- -------- -------- unix_schedule_gc() unix_schedule_gc() `- if (!gc_in_progress) `- if (!gc_in_progress) |- gc_in_progress = true | `- queue_work() | unix_gc() <----------------/ | | |- gc_in_progress = true ... `- queue_work() | | `- gc_in_progress = false | | unix_gc() <---------------------------------------------' | ... /* gc_in_progress == false */ | `- gc_in_progress = false unix_peek_fpl() relies on gc_in_progress not to confuse GC by MSG_PEEK. Let's set gc_in_progress to true in unix_gc(). 
Fixes: 8b90a9f819dc ("af_unix: Run GC on only one CPU.") Reported-by: Igor Ushakov Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260501073945.1884564-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index a7967a345827..0783555e2526 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -607,6 +607,8 @@ static void unix_gc(struct work_struct *work) struct sk_buff_head hitlist; struct sk_buff *skb; + WRITE_ONCE(gc_in_progress, true); + spin_lock(&unix_gc_lock); if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { @@ -649,10 +651,8 @@ void unix_schedule_gc(struct user_struct *user) READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; - if (!READ_ONCE(gc_in_progress)) { - WRITE_ONCE(gc_in_progress, true); + if (!READ_ONCE(gc_in_progress)) queue_work(system_dfl_wq, &unix_gc_work); - } if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); -- cgit v1.2.3 From 76b93a8107574006b25495664304ea9237494d70 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 1 May 2026 02:58:41 -0700 Subject: netpoll: pass buffer size to egress_dev() to avoid MAC truncation egress_dev() formats np->dev_mac via snprintf() but receives buf as a bare char *, so it cannot derive the buffer size from the pointer. The size argument was hardcoded to MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1 = 17), which is silly wrong in two ways: 1) misleading kernel log output on the MAC-selected target path (np->dev_name[0] == '\0'); for example "aa:bb:cc:dd:ee:ff doesn't exist, aborting" was logged as "aa:bb:cc:dd:ee:f doesn't exist, aborting". 2) the second argument of snprintf is the size of the buffer, not the size of what you want to write. Add a bufsz parameter to egress_dev() and pass sizeof(buf) from each caller, matching the standard snprintf() idiom and removing the hardcoded size from the helper. 
Every caller already declares "char buf[MAC_ADDR_STR_LEN + 1]" so the formatted MAC continues to fit. Tested by booting with netconsole=6665@/aa:bb:cc:dd:ee:ff,6666@10.0.0.1/00:11:22:33:44:55 on a kernel without a matching device. Pre-fix dmesg shows "aa:bb:cc:dd:ee:f doesn't exist, aborting"; post-fix shows the full "aa:bb:cc:dd:ee:ff doesn't exist, aborting". Fixes: f8a10bed32f5 ("netconsole: allow selection of egress interface via MAC address") Cc: stable@vger.kernel.org Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20260501-netpoll_snprintf_fix-v1-1-84b0566e6597@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4381e0fc25bf..84faace50ac2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -608,14 +608,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); /* * Returns a pointer to a string representation of the identifier used * to select the egress interface for the given netpoll instance. buf - * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + * is used to format np->dev_mac when np->dev_name is empty; bufsz must + * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address + * and its NUL terminator. 
*/ -static char *egress_dev(struct netpoll *np, char *buf) +static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz) { if (np->dev_name[0]) return np->dev_name; - snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + snprintf(buf, bufsz, "%pM", np->dev_mac); return buf; } @@ -645,7 +647,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) if (!IS_ENABLED(CONFIG_IPV6)) { np_err(np, "IPv6 is not supported %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EINVAL; } @@ -667,7 +669,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return err; } @@ -687,14 +689,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } @@ -736,7 +738,8 @@ int netpoll_setup(struct netpoll *np) ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); + np_err(np, "%s doesn't exist, aborting\n", + egress_dev(np, buf, sizeof(buf))); err = -ENODEV; goto unlock; } @@ -744,14 +747,14 @@ int netpoll_setup(struct netpoll *np) if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = -EBUSY; goto put; } if (!netif_running(ndev)) { np_info(np, "device %s not up yet, forcing it\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = dev_open(ndev, NULL); if (err) { -- cgit v1.2.3 From 
36bdc0e815b4e8a05b9028d8ef8a25e1ead35cc1 Mon Sep 17 00:00:00 2001 From: Markus Baier Date: Fri, 1 May 2026 18:39:41 +0200 Subject: net: usb: asix: ax88772: re-add usbnet_link_change() in phylink callbacks Commit e0bffe3e6894 ("net: asix: ax88772: migrate to phylink") replaced the asix_adjust_link() PHY callback with phylink's mac_link_up() and mac_link_down() handlers, but did not carry over the usbnet_link_change() notification that commit 805206e66fab ("net: asix: fix "can't send until first packet is send" issue") had added. As a result, the original symptom returns: when the link comes up, usbnet is never notified, so the RX URB submission stays dormant until some other event (e.g. a transmitted packet triggering the status endpoint interrupt) wakes it up. This is reproducible with the Apple A1277 USB Ethernet Adapter (05ac:1402, AX88772A based) on a Banana Pro using a static IPv4 configuration. After bringing the interface up, no incoming packets are received until the first outgoing frame triggers usbnet's RX path. Restore the link change notification, gated on a carrier transition so the call remains idempotent if the status endpoint also reports the change later. 
Fixes: e0bffe3e6894 ("net: asix: ax88772: migrate to phylink") Signed-off-by: Markus Baier Tested-by: Oleksij Rempel Link: https://patch.msgid.link/20260501163941.107668-1-Markus.Baier@soslab.tu-darmstadt.de Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index df0bcfedddbc..293ef80c4e30 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -756,6 +756,7 @@ static void ax88772_mac_link_down(struct phylink_config *config, struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); asix_write_medium_mode(dev, 0, 0); + usbnet_link_change(dev, false, false); } static void ax88772_mac_link_up(struct phylink_config *config, @@ -786,6 +787,7 @@ static void ax88772_mac_link_up(struct phylink_config *config, m |= AX_MEDIUM_RFC; asix_write_medium_mode(dev, m, 0); + usbnet_link_change(dev, true, false); } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { -- cgit v1.2.3 From 059b7dbd20a6f0c539a45ddff1573cb8946685b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Apr 2026 12:26:52 +0000 Subject: vsock/virtio: fix potential unbounded skb queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virtio_transport_inc_rx_pkt() checks vvs->rx_bytes + len > vvs->buf_alloc. virtio_transport_recv_enqueue() skips coalescing for packets with VIRTIO_VSOCK_SEQ_EOM. If fed with packets with len == 0 and VIRTIO_VSOCK_SEQ_EOM, a very large number of packets can be queued because vvs->rx_bytes stays at 0. Fix this by estimating the skb metadata size: (Number of skbs in the queue) * SKB_TRUESIZE(0) Fixes: 077706165717 ("virtio/vsock: don't use skbuff state to account credit") Signed-off-by: Eric Dumazet Cc: Arseniy Krasnov Cc: Stefan Hajnoczi Cc: Stefano Garzarella Cc: "Michael S. 
Tsirkin" Cc: Jason Wang Cc: Xuan Zhuo Cc: "Eugenio Pérez" Cc: virtualization@lists.linux.dev Link: https://patch.msgid.link/20260430122653.554058-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/virtio_transport_common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 416d533f493d..9b8014516f4f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -447,7 +447,9 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len) { - if (vvs->buf_used + len > vvs->buf_alloc) + u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0); + + if (skb_overhead + vvs->buf_used + len > vvs->buf_alloc) return false; vvs->rx_bytes += len; -- cgit v1.2.3 From c4a99a921949cddc590b22bb14eeb23dffcc3ba6 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Fri, 1 May 2026 21:35:34 +0200 Subject: mptcp: use MPJoinSynAckHMacFailure for SynAck HMAC failure In subflow_finish_connect(), HMAC validation of the server's HMAC in SYN/ACK + MP_JOIN increments MPTCP_MIB_JOINACKMAC ("HMAC was wrong on ACK + MP_JOIN") on failure. The function processes the SYN/ACK, not the ACK; the matching MPTCP_MIB_JOINSYNACKMAC counter ("HMAC was wrong on SYN/ACK + MP_JOIN") exists but is not incremented anywhere in the tree. The mirror site on the server, subflow_syn_recv_sock(), already uses JOINACKMAC correctly for ACK HMAC failure. Use JOINSYNACKMAC at the SYN/ACK validation site so each counter reflects the packet whose HMAC actually failed. 
Suggested-by: Matthieu Baerts (NGI0) Fixes: fc518953bc9c ("mptcp: add and use MIB counter infrastructure") Cc: stable@vger.kernel.org Signed-off-by: Shardul Bankar Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-1-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e2cb9d23e4a0..bda6862264ca 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -581,7 +581,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->backup); if (!subflow_thmac_valid(subflow)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKMAC); subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } -- cgit v1.2.3 From a6da02d4c00fdda2417e42ad2b762a9209e6cc49 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Fri, 1 May 2026 21:35:35 +0200 Subject: mptcp: use MPTCP_RST_EMPTCP for ACK HMAC validation failure When HMAC validation fails on a received ACK + MP_JOIN in subflow_syn_recv_sock(), the subflow is reset with reason MPTCP_RST_EPROHIBIT ("Administratively prohibited"). This is incorrect: HMAC validation failure is an MPTCP protocol-level error, not an administrative policy denial. The mirror site on the client, in subflow_finish_connect(), already uses MPTCP_RST_EMPTCP ("MPTCP-specific error") for the same kind of HMAC failure on the SYN/ACK + MP_JOIN. Use the same reason on the server side for symmetry and accuracy. 
Suggested-by: Matthieu Baerts (NGI0) Fixes: 443041deb5ef ("mptcp: fix NULL pointer in can_accept_new_subflow") Cc: stable@vger.kernel.org Signed-off-by: Shardul Bankar Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-2-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index bda6862264ca..d562e149606f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -908,7 +908,7 @@ create_child: if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); - subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } -- cgit v1.2.3 From 6254a16d6f0c672e3809ca5d7c9a28a55d71f764 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 1 May 2026 21:35:36 +0200 Subject: mptcp: fix rx timestamp corruption on fastopen The skb cb offset containing the timestamp presence flag is cleared before loading such information. Cache such value before MPTCP CB initialization. 
Fixes: 36b122baf6a8 ("mptcp: add subflow_v(4,6)_send_synack()") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-3-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/fastopen.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index 82ec15bcfd7f..082c46c0f50e 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -12,6 +12,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + bool has_rxtstamp; /* on early fallback the subflow context is deleted by * subflow_syn_recv_sock() @@ -40,12 +41,13 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf */ tp->copied_seq += skb->len; subflow->ssn_offset += skb->len; + has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* Only the sequence delta is relevant */ MPTCP_SKB_CB(skb)->map_seq = -skb->len; MPTCP_SKB_CB(skb)->end_seq = 0; MPTCP_SKB_CB(skb)->offset = 0; - MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 1; mptcp_data_lock(sk); -- cgit v1.2.3 From 70ece9d7021c54cf40c72b31b066e9088f5f75f5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 1 May 2026 21:35:37 +0200 Subject: mptcp: sockopt: increase seq in mptcp_setsockopt_all_sf mptcp_setsockopt_all_sf() was missing a call to sockopt_seq_inc(). This is required not to cause missing synchronization for newer subflows created later on. This helper is called each time a socket option is set on subflows, and future ones will need to inherit this option after their creation. 
Fixes: 51c5fd09e1b4 ("mptcp: add TCP_MAXSEG sockopt support") Cc: stable@vger.kernel.org Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260501-net-mptcp-misc-fixes-7-1-rc3-v1-4-b70118df778e@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 0efe40be2fde..1cf608e7357b 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -812,6 +812,10 @@ static int mptcp_setsockopt_all_sf(struct mptcp_sock *msk, int level, if (ret) break; } + + if (!ret) + sockopt_seq_inc(msk); + return ret; } -- cgit v1.2.3 From ac0841d7d202073415c808bda7848502163b87dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 May 2026 12:41:02 +0000 Subject: net: prevent possible UAF in rtnl_prop_list_size() I was mistaken by synchronize_rcu() [1] call in netdev_name_node_alt_destroy(), giving a false sense of RCU safety at delete times. We have to use list_del_rcu() to not confuse potential readers in rtnl_prop_list_size(). [1] This synchronize_rcu() call was later removed in commit 723de3ebef03 ("net: free altname using an RCU callback"). 
Fixes: 9f30831390ed ("net: add rcu safety to rtnl_prop_list_size()") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260502124102.499204-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 06c195906231..8bfa8313ef62 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head) static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); - list_del(&name_node->list); + list_del_rcu(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } -- cgit v1.2.3 From 30cb24f97d44f6b81c14b85c5323de62eef1fb7f Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sat, 2 May 2026 15:19:45 +0100 Subject: psp: strip variable-length PSP header in psp_dev_rcv() psp_dev_rcv() unconditionally removes a fixed PSP_ENCAP_HLEN, even when psph->hdrlen indicates that the PSP header carries optional fields. A frame whose PSP header advertises a non-zero VC or any extension would therefore be silently mis-decapsulated: option bytes would spill into the inner packet head and downstream parsing would fail on a corrupted skb. Compute the full PSP header length from psph->hdrlen, pull the optional bytes into the linear region, and strip the whole header when decapsulating. Optional fields (VC, ...) are still ignored, just discarded with the rest of the header instead of leaking. crypt_offset and the VIRT flag are intentionally not validated here - callers know their device's PSP implementation and can decide. Both in-tree callers gate on hardware-validated PSP, so this is a correctness fix rather than a reachable corruption path under current configurations. 
Fixes: 0eddb8023cee ("psp: provide decapsulation and receive helper for drivers") Reviewed-by: Willem de Bruijn Reviewed-by: Daniel Zahka Cc: stable@vger.kernel.org Signed-off-by: David Carlier Link: https://patch.msgid.link/20260502141945.14484-1-devnexen@gmail.com Signed-off-by: Jakub Kicinski --- net/psp/psp_main.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index 9508b6c38003..e45549f08eef 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -263,15 +263,16 @@ EXPORT_SYMBOL(psp_dev_encapsulate); /* Receive handler for PSP packets. * - * Presently it accepts only already-authenticated packets and does not - * support optional fields, such as virtualization cookies. The caller should - * ensure that skb->data is pointing to the mac header, and that skb->mac_len - * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE - * is not supported). + * Accepts only already-authenticated packets. The full PSP header is + * stripped according to psph->hdrlen; any optional fields it advertises + * (virtualization cookies, etc.) are ignored and discarded along with the + * rest of the header. The caller should ensure that skb->data is pointing + * to the mac header, and that skb->mac_len is set. This function does not + * currently adjust skb->csum (CHECKSUM_COMPLETE is not supported). 
*/ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) { - int l2_hlen = 0, l3_hlen, encap; + int l2_hlen = 0, l3_hlen, encap, psp_hlen; struct psp_skb_ext *pse; struct psphdr *psph; struct ethhdr *eth; @@ -312,18 +313,36 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) return -EINVAL; - pse = skb_ext_add(skb, SKB_EXT_PSP); - if (!pse) + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + + /* Strip the full PSP header per psph->hdrlen; VC/options are pulled + * into the linear region only so they can be discarded with the + * rest of the header. + */ + psp_hlen = (psph->hdrlen + 1) * 8; + + if (unlikely(psp_hlen < sizeof(struct psphdr))) + return -EINVAL; + + if (psp_hlen > sizeof(struct psphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + + sizeof(struct udphdr) + psp_hlen)) return -EINVAL; psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + sizeof(struct udphdr)); + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + pse->spi = psph->spi; pse->dev_id = dev_id; pse->generation = generation; pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); - encap = PSP_ENCAP_HLEN; + encap = sizeof(struct udphdr) + psp_hlen; encap += strip_icv ? 
PSP_TRL_SIZE : 0; if (proto == htons(ETH_P_IP)) { @@ -340,8 +359,9 @@ int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); } - memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); - skb_pull(skb, PSP_ENCAP_HLEN); + memmove(skb->data + sizeof(struct udphdr) + psp_hlen, + skb->data, l2_hlen + l3_hlen); + skb_pull(skb, sizeof(struct udphdr) + psp_hlen); if (strip_icv) pskb_trim(skb, skb->len - PSP_TRL_SIZE); -- cgit v1.2.3 From a6039776c7994dd0b9a4acce23a3f897d1688cbf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 2 May 2026 18:07:47 +0000 Subject: ipmr: Add __rcu to netns_ipv4.mrt. kernel test robot reported this Sparse warning: $ make C=1 net/ipv4/ipmr.o net/ipv4/ipmr.c:312:24: error: incompatible types in comparison expression (different address spaces): net/ipv4/ipmr.c:312:24: struct mr_table [noderef] __rcu * net/ipv4/ipmr.c:312:24: struct mr_table * Let's add __rcu annotation to netns_ipv4.mrt. 
Fixes: b3b6babf4751 ("ipmr: Free mr_table after RCU grace period.") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202605030032.glNApko7-lkp@intel.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260502180755.359554-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 80ccd4dda8e0..6e27c56514df 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -275,7 +275,7 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES - struct mr_table *mrt; + struct mr_table __rcu *mrt; #else struct list_head mr_tables; struct fib_rules_ops *mr_rules_ops; -- cgit v1.2.3 From 07d99587396024932e02474c3a5bede71d108454 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Sat, 2 May 2026 11:55:02 +0100 Subject: net: dsa: mt7530: fix .get_stats64 sleeping in atomic context The .get_stats64 callback runs in atomic context, but on MDIO-connected switches every register read acquires the MDIO bus mutex, which can sleep: [ 12.645973] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:609 [ 12.654442] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 759, name: grep [ 12.663377] preempt_count: 0, expected: 0 [ 12.667410] RCU nest depth: 1, expected: 0 [ 12.671511] INFO: lockdep is turned off. 
[ 12.675441] CPU: 0 UID: 0 PID: 759 Comm: grep Tainted: G S W 7.0.0+ #0 PREEMPT [ 12.675453] Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN [ 12.675456] Hardware name: Bananapi BPI-R64 (DT) [ 12.675459] Call trace: [ 12.675462] show_stack+0x14/0x1c (C) [ 12.675477] dump_stack_lvl+0x68/0x8c [ 12.675487] dump_stack+0x14/0x1c [ 12.675495] __might_resched+0x14c/0x220 [ 12.675504] __might_sleep+0x44/0x80 [ 12.675511] __mutex_lock+0x50/0xb10 [ 12.675523] mutex_lock_nested+0x20/0x30 [ 12.675532] mt7530_get_stats64+0x40/0x2ac [ 12.675542] dsa_user_get_stats64+0x2c/0x40 [ 12.675553] dev_get_stats+0x44/0x1e0 [ 12.675564] dev_seq_printf_stats+0x24/0xe0 [ 12.675575] dev_seq_show+0x14/0x3c [ 12.675583] seq_read_iter+0x37c/0x480 [ 12.675595] seq_read+0xd0/0xec [ 12.675605] proc_reg_read+0x94/0xe4 [ 12.675615] vfs_read+0x98/0x29c [ 12.675625] ksys_read+0x54/0xdc [ 12.675633] __arm64_sys_read+0x18/0x20 [ 12.675642] invoke_syscall.constprop.0+0x54/0xec [ 12.675653] do_el0_svc+0x3c/0xb4 [ 12.675662] el0_svc+0x38/0x200 [ 12.675670] el0t_64_sync_handler+0x98/0xdc [ 12.675679] el0t_64_sync+0x158/0x15c For MDIO-connected switches, poll MIB counters asynchronously using a delayed workqueue every second and let .get_stats64 return the cached values under a spinlock. A mod_delayed_work() call on each read triggers an immediate refresh so counters stay responsive when queried more frequently. MMIO-connected switches (MT7988, EN7581, AN7583) are not affected because their regmap does not sleep, so they continue to read MIB counters directly in .get_stats64. Fixes: 88c810f35ed5 ("net: dsa: mt7530: implement .get_stats64") Signed-off-by: Daniel Golle Acked-by: Chester A. 
Unal Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/6940b913da2c29156f0feff74b678d3c526ee84c.1777719253.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++-- drivers/net/dsa/mt7530.h | 8 ++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index b9423389c2ef..44d670904ad8 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -25,6 +25,9 @@ #include "mt7530.h" +#define MT7530_STATS_POLL_INTERVAL (1 * HZ) +#define MT7530_STATS_RATE_LIMIT (HZ / 10) + static struct mt753x_pcs *pcs_to_mt753x_pcs(struct phylink_pcs *pcs) { return container_of(pcs, struct mt753x_pcs, pcs); @@ -906,10 +909,9 @@ static void mt7530_get_rmon_stats(struct dsa_switch *ds, int port, *ranges = mt7530_rmon_ranges; } -static void mt7530_get_stats64(struct dsa_switch *ds, int port, - struct rtnl_link_stats64 *storage) +static void mt7530_read_port_stats64(struct mt7530_priv *priv, int port, + struct rtnl_link_stats64 *storage) { - struct mt7530_priv *priv = ds->priv; uint64_t data; /* MIB counter doesn't provide a FramesTransmittedOK but instead @@ -951,6 +953,54 @@ static void mt7530_get_stats64(struct dsa_switch *ds, int port, &storage->rx_crc_errors); } +static void mt7530_stats_refresh(struct mt7530_priv *priv) +{ + struct rtnl_link_stats64 stats = {}; + struct dsa_port *dp; + int port; + + dsa_switch_for_each_user_port(dp, priv->ds) { + port = dp->index; + + mt7530_read_port_stats64(priv, port, &stats); + + spin_lock_bh(&priv->stats_lock); + priv->ports[port].stats = stats; + priv->stats_last = jiffies; + spin_unlock_bh(&priv->stats_lock); + } +} + +static void mt7530_stats_poll(struct work_struct *work) +{ + struct mt7530_priv *priv = container_of(work, struct mt7530_priv, + stats_work.work); + + mt7530_stats_refresh(priv); + schedule_delayed_work(&priv->stats_work, + MT7530_STATS_POLL_INTERVAL); +} + +static void 
mt7530_get_stats64(struct dsa_switch *ds, int port, + struct rtnl_link_stats64 *storage) +{ + struct mt7530_priv *priv = ds->priv; + bool refresh; + + if (priv->bus) { + spin_lock_bh(&priv->stats_lock); + *storage = priv->ports[port].stats; + refresh = time_after(jiffies, priv->stats_last + + MT7530_STATS_RATE_LIMIT); + spin_unlock_bh(&priv->stats_lock); + if (refresh) + mod_delayed_work(system_percpu_wq, + &priv->stats_work, 0); + } else { + mt7530_read_port_stats64(priv, port, storage); + } +} + static void mt7530_get_eth_ctrl_stats(struct dsa_switch *ds, int port, struct ethtool_eth_ctrl_stats *ctrl_stats) { @@ -3137,9 +3187,24 @@ mt753x_setup(struct dsa_switch *ds) if (ret && priv->irq_domain) mt7530_free_mdio_irq(priv); + if (!ret && priv->bus) { + mt7530_stats_refresh(priv); + schedule_delayed_work(&priv->stats_work, + MT7530_STATS_POLL_INTERVAL); + } + return ret; } +static void +mt753x_teardown(struct dsa_switch *ds) +{ + struct mt7530_priv *priv = ds->priv; + + if (priv->bus) + cancel_delayed_work_sync(&priv->stats_work); +} + static int mt753x_set_mac_eee(struct dsa_switch *ds, int port, struct ethtool_keee *e) { @@ -3257,6 +3322,7 @@ static int mt7988_setup(struct dsa_switch *ds) static const struct dsa_switch_ops mt7530_switch_ops = { .get_tag_protocol = mtk_get_tag_protocol, .setup = mt753x_setup, + .teardown = mt753x_teardown, .preferred_default_local_cpu_port = mt753x_preferred_default_local_cpu_port, .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, @@ -3395,6 +3461,9 @@ mt7530_probe_common(struct mt7530_priv *priv) priv->ds->ops = &mt7530_switch_ops; priv->ds->phylink_mac_ops = &mt753x_phylink_mac_ops; mutex_init(&priv->reg_mutex); + spin_lock_init(&priv->stats_lock); + INIT_DELAYED_WORK(&priv->stats_work, mt7530_stats_poll); + dev_set_drvdata(dev, priv); return 0; diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 3e0090bed298..dd33b0df3419 100644 --- a/drivers/net/dsa/mt7530.h +++ 
b/drivers/net/dsa/mt7530.h @@ -796,6 +796,7 @@ struct mt7530_fdb { * @pvid: The VLAN specified is to be considered a PVID at ingress. Any * untagged frames will be assigned to the related VLAN. * @sgmii_pcs: Pointer to PCS instance for SerDes ports + * @stats: Cached port statistics for MDIO-connected switches */ struct mt7530_port { bool enable; @@ -803,6 +804,7 @@ struct mt7530_port { u32 pm; u16 pvid; struct phylink_pcs *sgmii_pcs; + struct rtnl_link_stats64 stats; }; /* Port 5 mode definitions of the MT7530 switch */ @@ -875,6 +877,9 @@ struct mt753x_info { * @create_sgmii: Pointer to function creating SGMII PCS instance(s) * @active_cpu_ports: Holding the active CPU ports * @mdiodev: The pointer to the MDIO device structure + * @stats_lock: Protects cached per-port stats from concurrent access + * @stats_work: Delayed work for polling MIB counters on MDIO switches + * @stats_last: Jiffies timestamp of last MIB counter poll */ struct mt7530_priv { struct device *dev; @@ -900,6 +905,9 @@ struct mt7530_priv { int (*create_sgmii)(struct mt7530_priv *priv); u8 active_cpu_ports; struct mdio_device *mdiodev; + spinlock_t stats_lock; /* protects cached stats counters */ + struct delayed_work stats_work; + unsigned long stats_last; }; struct mt7530_hw_vlan_entry { -- cgit v1.2.3 From f4c50a4034e62ab75f1d5cdd191dd5f9c77fdff4 Mon Sep 17 00:00:00 2001 From: Kuan-Ting Chen Date: Mon, 4 May 2026 23:27:12 +0800 Subject: xfrm: esp: avoid in-place decrypt on shared skb frags MSG_SPLICE_PAGES can attach pages from a pipe directly to an skb. TCP marks such skbs with SKBFL_SHARED_FRAG after skb_splice_from_iter(), so later paths that may modify packet data can first make a private copy. The IPv4/IPv6 datagram append paths did not set this flag when splicing pages into UDP skbs. That leaves an ESP-in-UDP packet made from shared pipe pages looking like an ordinary uncloned nonlinear skb. 
ESP input then takes the no-COW fast path for uncloned skbs without a frag_list and decrypts in place over data that is not owned privately by the skb. Mark IPv4/IPv6 datagram splice frags with SKBFL_SHARED_FRAG, matching TCP. Also make ESP input fall back to skb_cow_data() when the flag is present, so ESP does not decrypt externally backed frags in place. Private nonlinear skb frags still use the existing fast path. This intentionally does not change ESP output. In esp_output_head(), the path that appends the ESP trailer to existing skb tailroom without calling skb_cow_data() is not reachable for nonlinear skbs: skb_tailroom() returns zero when skb->data_len is nonzero, while ESP tailen is positive. Thus ESP output will either use the separate destination-frag path or fall back to skb_cow_data(). Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Fixes: 7da0dde68486 ("ip, udp: Support MSG_SPLICE_PAGES") Fixes: 6d8192bd69bb ("ip6, udp6: Support MSG_SPLICE_PAGES") Reported-by: Hyunwoo Kim Reported-by: Kuan-Ting Chen Tested-by: Hyunwoo Kim Cc: stable@vger.kernel.org Signed-off-by: Kuan-Ting Chen Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 3 ++- net/ipv4/ip_output.c | 2 ++ net/ipv6/esp6.c | 3 ++- net/ipv6/ip6_output.c | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 6dfc0bcdef65..6a5febbdbee4 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -873,7 +873,8 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = 1; goto skip_cow; - } else if (!skb_has_frag_list(skb)) { + } else if (!skb_has_frag_list(skb) && + !skb_has_shared_frag(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e4790cc7b5c2..5bcd73cbdb41 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,6 +1233,8 @@ alloc_new_skb: if (err < 0) goto error; 
copy = err; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 9f75313734f8..9c06c5a1419d 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -915,7 +915,8 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) nfrags = 1; goto skip_cow; - } else if (!skb_has_frag_list(skb)) { + } else if (!skb_has_frag_list(skb) && + !skb_has_shared_frag(skb)) { nfrags = skb_shinfo(skb)->nr_frags; nfrags++; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e92909ab5be..1f2a33fbed6e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1794,6 +1794,8 @@ alloc_new_skb: if (err < 0) goto error; copy = err; + if (!(flags & MSG_NO_SHARED_FRAGS)) + skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; -- cgit v1.2.3 From 4a142520d166f91627f27a7017525a228137c808 Mon Sep 17 00:00:00 2001 From: Jakov Novak Date: Mon, 4 May 2026 18:23:57 +0200 Subject: wifi: libertas: notify firmware load wait on disconnect Currently, when the firmware is not fully loaded and if_usb_disconnect is called, if_usb_prog_firmware gets stuck waiting for cardp->surprise_removed or cardp->fwdnldover while lbs_remove_card also waits for the firmware loading to be completed, which never happens. This caused the reported syzbot bug. To address this, the wake_up function call can be added in the if_usb_disconnect function which notifies the if_usb_prog_firmware thread and resolves the firmware loading. 
Fixes: 954ee164f4f4 ("[PATCH] libertas: reorganize and simplify init sequence") Reported-and-tested-by: syzbot+c99d17aa44dbdba16ad2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c99d17aa44dbdba16ad2 Signed-off-by: Jakov Novak Link: https://patch.msgid.link/20260504162356.17250-2-jakovnovak30@gmail.com [fix subject] Signed-off-by: Johannes Berg --- drivers/net/wireless/marvell/libertas/if_usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/marvell/libertas/if_usb.c b/drivers/net/wireless/marvell/libertas/if_usb.c index a00d53350fa9..5cc0c5cac257 100644 --- a/drivers/net/wireless/marvell/libertas/if_usb.c +++ b/drivers/net/wireless/marvell/libertas/if_usb.c @@ -310,6 +310,7 @@ static void if_usb_disconnect(struct usb_interface *intf) struct lbs_private *priv = cardp->priv; cardp->surprise_removed = 1; + wake_up(&cardp->fw_wq); if (priv) { lbs_stop_card(priv); -- cgit v1.2.3 From e9e334f8063a991b4f648b8dbb8dac44cf810540 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Wed, 29 Apr 2026 20:57:52 -0700 Subject: net: mana: check xdp_rxq registration before unreg in mana_destroy_rxq() When mana_create_rxq() fails at mana_create_wq_obj() or any step before xdp_rxq_info_reg() is called, the error path jumps to `out:` which calls mana_destroy_rxq(). 
mana_destroy_rxq() unconditionally calls xdp_rxq_info_unreg() on an xdp_rxq that was never registered, triggering a WARN_ON in net/core/xdp.c: mana 7870:00:00.0: HWC: Failed hw_channel req: 0xc000009a mana 7870:00:00.0 eth7: Failed to create RXQ: err = -71 Driver BUG WARNING: CPU: 442 PID: 491615 at ../net/core/xdp.c:150 xdp_rxq_info_unreg+0x44/0x70 Modules linked in: tcp_bbr xsk_diag udp_diag raw_diag unix_diag af_packet_diag netlink_diag nf_tables nfnetlink tcp_diag inet_diag binfmt_misc rpcsec_gss_krb5 nfsv3 nfs_acl auth_rpcgss nfsv4 dns_resolver nfs lockd ext4 grace crc16 iscsi_tcp mbcache fscache libiscsi_tcp jbd2 netfs rpcrdma af_packet sunrpc rdma_ucm ib_iser rdma_cm iw_cm iscsi_ibft ib_cm iscsi_boot_sysfs libiscsi rfkill scsi_transport_iscsi mana_ib ib_uverbs ib_core mana hyperv_drm(X) drm_shmem_helper intel_rapl_msr drm_kms_helper intel_rapl_common syscopyarea nls_iso8859_1 sysfillrect intel_uncore_frequency_common nls_cp437 vfat fat nfit sysimgblt libnvdimm hv_netvsc(X) hv_utils(X) fb_sys_fops hv_balloon(X) joydev fuse drm dm_mod configfs ip_tables x_tables xfs libcrc32c sd_mod nvme nvme_core nvme_common t10_pi crc64_rocksoft_generic crc64_rocksoft crc64 hid_generic serio_raw pci_hyperv(X) hv_storvsc(X) scsi_transport_fc hyperv_keyboard(X) hid_hyperv(X) pci_hyperv_intf(X) crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel crypto_simd cryptd hv_vmbus(X) softdog sg scsi_mod efivarfs Supported: Yes, External CPU: 442 PID: 491615 Comm: ethtool Kdump: loaded Tainted: G X 5.14.21-150500.55.136-default #1 SLE15-SP5 a627be1b53abbfd64ad16b2685e4308c52847f42 Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 07/25/2025 RIP: 0010:xdp_rxq_info_unreg+0x44/0x70 Code: e8 91 fe ff ff c7 43 0c 02 00 00 00 48 c7 03 00 00 00 00 5b c3 cc cc cc cc e9 58 3a 1c 00 48 c7 c7 f6 5f 19 97 e8 5c a4 7e ff <0f> 0b 83 7b 0c 01 74 ca 48 c7 c7 d9 5f 19 97 e8 48 a4 7e ff 0f 0b RSP: 0018:ff3df6c8f7207818 EFLAGS: 00010286 RAX: 
0000000000000000 RBX: ff30d89f94808a80 RCX: 0000000000000027 RDX: 0000000000000000 RSI: 0000000000000002 RDI: ff30d94bdcca2908 RBP: 0000000000080000 R08: ffffffff98ed11a0 R09: ff3df6c8f72077a0 R10: dead000000000100 R11: 000000000000000a R12: 0000000000000000 R13: 0000000000002000 R14: 0000000000040000 R15: ff30d89f94800000 FS: 00007fe6d8432b80(0000) GS:ff30d94bdcc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe6d81a89b1 CR3: 00000b3b6d578001 CR4: 0000000000371ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 Call Trace: mana_destroy_rxq+0x5b/0x2f0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] mana_create_rxq.isra.55+0x3db/0x720 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] ? simple_lookup+0x36/0x50 ? current_time+0x42/0x80 ? __d_free_external+0x30/0x30 mana_alloc_queues+0x32a/0x470 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] ? _raw_spin_unlock+0xa/0x30 ? d_instantiate.part.29+0x2e/0x40 ? _raw_spin_unlock+0xa/0x30 ? debugfs_create_dir+0xe4/0x140 mana_attach+0x5c/0xf0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] mana_set_ringparam+0xd5/0x1a0 [mana 267acf7006bcb696095bba4d810643d1db3b9e94] ethnl_set_rings+0x292/0x320 genl_family_rcv_msg_doit.isra.15+0x11b/0x150 genl_rcv_msg+0xe3/0x1e0 ? rings_prepare_data+0x80/0x80 ? genl_family_rcv_msg_doit.isra.15+0x150/0x150 netlink_rcv_skb+0x50/0x100 genl_rcv+0x24/0x40 netlink_unicast+0x1b6/0x280 netlink_sendmsg+0x365/0x4d0 sock_sendmsg+0x5f/0x70 __sys_sendto+0x112/0x140 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x80 ? handle_mm_fault+0xd7/0x290 ? do_user_addr_fault+0x2d8/0x740 ? 
exc_page_fault+0x67/0x150 entry_SYSCALL_64_after_hwframe+0x6b/0xd5 RIP: 0033:0x7fe6d8122f06 Code: 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 11 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 72 f3 c3 41 57 41 56 4d 89 c7 41 55 41 54 41 RSP: 002b:00007fff2b66b068 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 000055771123d2a0 RCX: 00007fe6d8122f06 RDX: 0000000000000034 RSI: 000055771123d3b0 RDI: 0000000000000003 RBP: 00007fff2b66b100 R08: 00007fe6d8203360 R09: 000000000000000c R10: 0000000000000000 R11: 0000000000000246 R12: 000055771123d350 R13: 000055771123d340 R14: 0000000000000000 R15: 00007fff2b66b2b0 Guard the xdp_rxq_info_unreg() call with xdp_rxq_info_is_reg() so that mana_destroy_rxq() is safe to call regardless of how far initialization progressed. Fixes: ed5356b53f07 ("net: mana: Add XDP support") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20260430035935.1859220-2-dipayanroy@linux.microsoft.com Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index a654b3699c4c..dfb4ba9f7664 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2520,7 +2520,9 @@ static void mana_destroy_rxq(struct mana_port_context *apc, napi_disable_locked(napi); netif_napi_del_locked(napi); } - xdp_rxq_info_unreg(&rxq->xdp_rxq); + + if (xdp_rxq_info_is_reg(&rxq->xdp_rxq)) + xdp_rxq_info_unreg(&rxq->xdp_rxq); mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); -- cgit v1.2.3 From 2a1c691182823a5c149d502ac153e249ee697b4a Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Wed, 29 Apr 2026 20:57:53 -0700 Subject: net: mana: Skip WQ object destruction for uninitialized RXQ In mana_destroy_rxq(), 
mana_destroy_wq_obj() is called unconditionally even when the WQ object was never created (rxobj is still INVALID_MANA_HANDLE). When mana_create_rxq() fails before mana_create_wq_obj() succeeds, the error path calls mana_destroy_rxq() which sends a bogus destroy command to the hardware: mana 7870:00:00.0: HWC: Failed hw_channel req: 0x1d mana 7870:00:00.0: Failed to send mana message: -71, 0x1d mana 7870:00:00.0 eth7: Failed to destroy WQ object: -71 Guard mana_destroy_wq_obj() with an INVALID_MANA_HANDLE check so that mana_destroy_rxq() is safe to call at any stage of RXQ initialization. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/20260430035935.1859220-3-dipayanroy@linux.microsoft.com Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index dfb4ba9f7664..f2a6ea162dc3 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2524,7 +2524,8 @@ static void mana_destroy_rxq(struct mana_port_context *apc, if (xdp_rxq_info_is_reg(&rxq->xdp_rxq)) xdp_rxq_info_unreg(&rxq->xdp_rxq); - mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); + if (rxq->rxobj != INVALID_MANA_HANDLE) + mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); mana_deinit_cq(apc, &rxq->rx_cq); -- cgit v1.2.3 From 3985c9a56da49af8b2e45cb1fa55c03c89b1d471 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Wed, 29 Apr 2026 20:57:54 -0700 Subject: net: mana: remove double CQ cleanup in mana_create_rxq error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In mana_create_rxq(), the error cleanup path calls mana_destroy_rxq() followed by mana_deinit_cq(). 
This is incorrect for two reasons: 1. mana_destroy_rxq() already calls mana_deinit_cq() internally, so the CQ's GDMA queue is destroyed twice. 2. mana_destroy_rxq() frees the rxq via kfree(rxq) before returning. The subsequent mana_deinit_cq(apc, cq) then operates on freed memory since cq points to &rxq->rx_cq, which is embedded in the already-freed rxq structure — a use-after-free. Remove the redundant mana_deinit_cq() call from the error path since mana_destroy_rxq() already handles CQ cleanup. mana_deinit_cq() is itself safe for an uninitialized CQ as it checks for a NULL gdma_cq before proceeding. Fixes: ca9c54d2d6a5 ("net: mana: Add a driver for Microsoft Azure Network Adapter (MANA)") Reviewed-by: Haiyang Zhang Signed-off-by: Dipayaan Roy Reviewed-by: Aditya Garg Link: https://patch.msgid.link/20260430035935.1859220-4-dipayanroy@linux.microsoft.com Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index f2a6ea162dc3..9afc786b297a 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -2799,9 +2799,6 @@ out: mana_destroy_rxq(apc, rxq, false); - if (cq) - mana_deinit_cq(apc, cq); - return NULL; } -- cgit v1.2.3 From 83861c48ba122f85cc8384780764b3a791341678 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 30 Apr 2026 23:32:50 +0200 Subject: openvswitch: vport: fix race between tunnel creation and linking When a tunnel vport is created it first creates the tunnel device, e.g., with geneve_dev_create_fb(), then it calls ovs_netdev_link() to take a reference and link it to the device that represents openvswitch datapath. The creation of the device is happening under RTNL, but then RTNL is released and re-acquired to find the device by name. 
It is technically possible for the tunnel device to be re-named or deleted within that window while RTNL is not held, and some other device created in its place. This will cause a non-tunnel device to be referenced in the vport and tunnel-specific functions used on it, e.g. vxlan_get_options() that directly casts the private netdev data into a struct vxlan_dev causing an invalid memory access: BUG: KASAN: slab-use-after-free in vxlan_get_options+0x323/0x3a0 vxlan_get_options+0x323/0x3a0 ovs_vport_cmd_new+0x6e3/0xd30 Fix that by taking a reference to the just created device before releasing RTNL. This ensures that the device in the vport is always the one that was just created. The search by name is only needed for a standard vport-netdev that links pre-existing devices, so that functionality and device type checks are moved to netdev_create(). It is also awkward that ovs_netdev_link() takes ownership of the vport and destroys it on failure. It doesn't know the type of the port it is dealing with, so we need to pass down the indicator that it's a tunnel, so the link can be properly deleted on failure. It's possible to refactor the logic to make the ovs_netdev_link() do only the linking part and let the callers perform a proper destruction, but it will be much more code for each legacy tunnel port type, so it is not worth it for the bug fix. 
Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device") Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Reported-by: Yang Yang Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Link: https://patch.msgid.link/20260430213349.407991-1-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- net/openvswitch/vport-geneve.c | 5 +++- net/openvswitch/vport-gre.c | 5 +++- net/openvswitch/vport-netdev.c | 58 +++++++++++++++++++++++++----------------- net/openvswitch/vport-netdev.h | 2 +- net/openvswitch/vport-vxlan.c | 5 +++- 5 files changed, 48 insertions(+), 27 deletions(-) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index b10e1602c6b1..cb5ea4424ffc 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -97,6 +97,9 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) goto error; } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; error: @@ -111,7 +114,7 @@ static struct vport *geneve_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_geneve_vport_ops = { diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 4014c9b5eb79..6cb5a697b396 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -63,6 +63,9 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) return ERR_PTR(err); } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; } @@ -75,7 +78,7 @@ static struct vport *gre_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_gre_vport_ops = { diff --git a/net/openvswitch/vport-netdev.c 
b/net/openvswitch/vport-netdev.c index 12055af832dc..a92ca8b37f96 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -73,37 +73,21 @@ static struct net_device *get_dpdev(const struct datapath *dp) return local->dev; } -struct vport *ovs_netdev_link(struct vport *vport, const char *name) +struct vport *ovs_netdev_link(struct vport *vport, bool tunnel) { int err; - vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); - if (!vport->dev) { + if (WARN_ON_ONCE(!vport->dev)) { err = -ENODEV; goto error_free_vport; } - /* Ensure that the device exists and that the provided - * name is not one of its aliases. - */ - if (strcmp(name, ovs_vport_name(vport))) { - err = -ENODEV; - goto error_put; - } - netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); - if (vport->dev->flags & IFF_LOOPBACK || - (vport->dev->type != ARPHRD_ETHER && - vport->dev->type != ARPHRD_NONE) || - ovs_is_internal_dev(vport->dev)) { - err = -EINVAL; - goto error_put; - } rtnl_lock(); err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp), NULL, NULL, NULL); if (err) - goto error_unlock; + goto error_put_unlock; err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); @@ -119,10 +103,11 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name) error_master_upper_dev_unlink: netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); -error_unlock: - rtnl_unlock(); -error_put: +error_put_unlock: + if (tunnel && vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); + rtnl_unlock(); error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); @@ -132,12 +117,39 @@ EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { struct vport *vport; + int err; vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + 
vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); + if (!vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); + + /* Ensure that the provided name is not an alias. */ + if (strcmp(parms->name, ovs_vport_name(vport))) { + err = -ENODEV; + goto error_put; + } + + if (vport->dev->flags & IFF_LOOPBACK || + (vport->dev->type != ARPHRD_ETHER && + vport->dev->type != ARPHRD_NONE) || + ovs_is_internal_dev(vport->dev)) { + err = -EINVAL; + goto error_put; + } + + return ovs_netdev_link(vport, false); +error_put: + netdev_put(vport->dev, &vport->dev_tracker); +error_free_vport: + ovs_vport_free(vport); + return ERR_PTR(err); } static void vport_netdev_free(struct rcu_head *rcu) diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index c5d83a43bfc4..6c0d7366f986 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -13,7 +13,7 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct vport *ovs_netdev_link(struct vport *vport, const char *name); +struct vport *ovs_netdev_link(struct vport *vport, bool tunnel); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 0b881b043bcf..c1b37b50d29e 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -126,6 +126,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) goto error; } + vport->dev = dev; + netdev_hold(vport->dev, &vport->dev_tracker, GFP_KERNEL); + rtnl_unlock(); return vport; error: @@ -140,7 +143,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return ovs_netdev_link(vport, parms->name); + return ovs_netdev_link(vport, true); } static struct vport_ops ovs_vxlan_netdev_vport_ops = { -- cgit v1.2.3 From aa69918bd418e700309fdd08509dba324fb24296 Mon Sep 
17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 May 2026 01:38:37 +0200 Subject: openvswitch: vport: fix self-deadlock on release of tunnel ports vports are used concurrently and protected by RCU, so netdev_put() must happen after the RCU grace period. So, either in an RCU call or after the synchronize_net(). The rtnl_delete_link() must happen under RTNL and so can't be executed in RCU context. Calling synchronize_net() while holding RTNL is not a good idea for performance and system stability under load in general, so calling netdev_put() in RCU call is the right solution here. However, when the device is deleted, rtnl_unlock() will call netdev_run_todo() and block until all the references are gone. In the current code this means that we never reach the call_rcu() and the vport is never freed and the reference is never released, causing a self-deadlock on device removal. Fix that by moving the call_rcu() before the rtnl_unlock(), so the scheduled RCU callback will be executed when synchronize_net() is called from the rtnl_unlock()->netdev_run_todo() while the RTNL itself is already released. 
Fixes: 6931d21f87bc ("openvswitch: defer tunnel netdev_put to RCU release") Cc: stable@vger.kernel.org Acked-by: Eelco Chaudron Signed-off-by: Ilya Maximets Acked-by: Aaron Conole Link: https://patch.msgid.link/20260430233848.440994-2-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- net/openvswitch/vport-netdev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index a92ca8b37f96..c42642075685 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -208,9 +208,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) */ if (vport->dev->reg_state == NETREG_REGISTERED) rtnl_delete_link(vport->dev, 0, NULL); - rtnl_unlock(); + /* We can't put the device reference yet, since it can still be in + * use, but rtnl_unlock()->netdev_run_todo() will block until all + * the references are released, so the RCU call must be before it. + */ call_rcu(&vport->rcu, vport_netdev_free); + rtnl_unlock(); } EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); -- cgit v1.2.3 From 05416ada37aa4efe93f25b0532f551d424fb7b3d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Fri, 1 May 2026 01:38:38 +0200 Subject: selftests: openvswitch: add tests for tunnel vport refcounting There were a few issues found with the tunnel vport types around the vport destruction code. Add some basic tests, so at least we know that they can be properly added and removed without obvious issues. The test creates OVS datapath, adds a non-LWT tunnel port, makes sure they are created, and then removes the datapath and waits for all the ports to be gone. The dpctl script had a few bugs in the non-LWT tunnel creation code, so fixing them as well to make the testing possible: - The type of the --lwt option changed in order to properly disable it. - Removed byte order conversion for the port numbers, as the value is supposed to be in the host order. - Added missing 'gre' choice for the tunnel type. 
Signed-off-by: Ilya Maximets Acked-by: Eelco Chaudron Acked-by: Aaron Conole Link: https://patch.msgid.link/20260430233848.440994-3-i.maximets@ovn.org Signed-off-by: Paolo Abeni --- .../selftests/net/openvswitch/openvswitch.sh | 37 ++++++++++++++++++++++ .../testing/selftests/net/openvswitch/ovs-dpctl.py | 19 +++++++---- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/openvswitch/openvswitch.sh b/tools/testing/selftests/net/openvswitch/openvswitch.sh index b327d3061ed5..3cdd953f6813 100755 --- a/tools/testing/selftests/net/openvswitch/openvswitch.sh +++ b/tools/testing/selftests/net/openvswitch/openvswitch.sh @@ -26,6 +26,7 @@ tests=" netlink_checks ovsnl: validate netlink attrs and settings upcall_interfaces ovs: test the upcall interfaces tunnel_metadata ovs: test extraction of tunnel metadata + tunnel_refcount ovs: test tunnel vport reference cleanup drop_reason drop: test drop reasons are emitted psample psample: Sampling packets with psample" @@ -830,6 +831,42 @@ test_tunnel_metadata() { return 0 } +test_tunnel_refcount() { + sbxname="test_tunnel_refcount" + sbx_add "${sbxname}" || return 1 + + ovs_sbx "${sbxname}" ip netns add trefns || return 1 + on_exit "ovs_sbx ${sbxname} ip netns del trefns" + + for tun_type in gre vxlan geneve; do + info "testing ${tun_type} tunnel vport refcount" + + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + add-dp dp-${tun_type} || return 1 + + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + add-if --no-lwt -t ${tun_type} \ + dp-${tun_type} ovs-${tun_type}0 || return 1 + + ovs_wait ip -netns trefns link show \ + ovs-${tun_type}0 >/dev/null 2>&1 || return 1 + + info "deleting dp - may hang if reference counting is broken" + ovs_sbx "${sbxname}" ip netns exec trefns \ + python3 $ovs_base/ovs-dpctl.py \ + del-dp dp-${tun_type} & + + dev_removed() { + ! 
ip -netns trefns link show "$1" >/dev/null 2>&1 + } + ovs_wait dev_removed dp-${tun_type} || return 1 + ovs_wait dev_removed ovs-${tun_type}0 || return 1 + done + return 0 +} + run_test() { ( tname="$1" diff --git a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py index 848f61fdcee0..bbe35e2718d2 100644 --- a/tools/testing/selftests/net/openvswitch/ovs-dpctl.py +++ b/tools/testing/selftests/net/openvswitch/ovs-dpctl.py @@ -11,7 +11,6 @@ import logging import math import multiprocessing import re -import socket import struct import sys import time @@ -2069,7 +2068,7 @@ class OvsVport(GenericNetlinkSocket): elif vport_type == "internal": return OvsVport.OVS_VPORT_TYPE_INTERNAL elif vport_type == "gre": - return OvsVport.OVS_VPORT_TYPE_INTERNAL + return OvsVport.OVS_VPORT_TYPE_GRE elif vport_type == "vxlan": return OvsVport.OVS_VPORT_TYPE_VXLAN elif vport_type == "geneve": @@ -2121,6 +2120,7 @@ class OvsVport(GenericNetlinkSocket): ) TUNNEL_DEFAULTS = [("geneve", 6081), + ("gre", 0), ("vxlan", 4789)] for tnl in TUNNEL_DEFAULTS: @@ -2129,9 +2129,13 @@ class OvsVport(GenericNetlinkSocket): dport = tnl[1] if not lwt: + if tnl[0] == "gre": + # GRE tunnels have no options. 
+ break + vportopt = OvsVport.ovs_vport_msg.vportopts() vportopt["attrs"].append( - ["OVS_TUNNEL_ATTR_DST_PORT", socket.htons(dport)] + ["OVS_TUNNEL_ATTR_DST_PORT", dport] ) msg["attrs"].append( ["OVS_VPORT_ATTR_OPTIONS", vportopt] @@ -2145,6 +2149,9 @@ class OvsVport(GenericNetlinkSocket): geneve_port=dport, geneve_collect_metadata=True, geneve_udp_zero_csum6_rx=1) + elif tnl[0] == "gre": + ipr.link("add", ifname=vport_ifname, kind="gretap", + gre_collect_metadata=True) elif tnl[0] == "vxlan": ipr.link("add", ifname=vport_ifname, kind=tnl[0], vxlan_learning=0, vxlan_collect_metadata=1, @@ -2563,7 +2570,7 @@ def print_ovsdp_full(dp_lookup_rep, ifindex, ndb=NDB(), vpl=OvsVport()): if vpo: dpo = vpo.get_attr("OVS_TUNNEL_ATTR_DST_PORT") if dpo: - opts += " tnl-dport:%s" % socket.ntohs(dpo) + opts += " tnl-dport:%s" % dpo print( " port %d: %s (%s%s)" % ( @@ -2632,7 +2639,7 @@ def main(argv): "--ptype", type=str, default="netdev", - choices=["netdev", "internal", "geneve", "vxlan"], + choices=["netdev", "internal", "gre", "geneve", "vxlan"], help="Interface type (default netdev)", ) addifcmd.add_argument( @@ -2645,7 +2652,7 @@ def main(argv): addifcmd.add_argument( "-l", "--lwt", - type=bool, + action=argparse.BooleanOptionalAction, default=True, help="Use LWT infrastructure instead of vport (default true)." ) -- cgit v1.2.3 From 44b550d88b267320459d518c0743a241ab2108fa Mon Sep 17 00:00:00 2001 From: Nan Li Date: Fri, 1 May 2026 09:08:44 +0800 Subject: net/rds: handle zerocopy send cleanup before the message is queued A zerocopy send can fail after user pages have been pinned but before the message is attached to the sending socket. The purge path currently infers zerocopy state from rm->m_rs, so an unqueued message can be cleaned up as if it owned normal payload pages. However, zerocopy ownership is really determined by the presence of op_mmp_znotifier, regardless of whether the message has reached the socket queue. 
Capture op_mmp_znotifier up front in rds_message_purge() and use it as the cleanup discriminator. If the message is already associated with a socket, keep the existing completion path. Otherwise, drop the pinned page accounting directly and release the notifier before putting the payload pages. This keeps early send failure cleanup consistent with the zerocopy lifetime rules without changing the normal queued completion path. Fixes: 0cebaccef3ac ("rds: zerocopy Tx support.") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Co-developed-by: Xiao Liu Signed-off-by: Xiao Liu Signed-off-by: Nan Li Signed-off-by: Ren Wei Reviewed-by: Allison Henderson Link: https://patch.msgid.link/d2ea98a6313d5467bac00f7c9fef8c7acddb9258.1777550074.git.tonanli66@gmail.com Signed-off-by: Paolo Abeni --- net/rds/message.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/rds/message.c b/net/rds/message.c index eaa6f22601a4..25fedcb3cd00 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -131,24 +131,34 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs, */ static void rds_message_purge(struct rds_message *rm) { + struct rds_znotifier *znotifier; unsigned long i, flags; - bool zcopy = false; + bool zcopy; if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; spin_lock_irqsave(&rm->m_rs_lock, flags); + znotifier = rm->data.op_mmp_znotifier; + rm->data.op_mmp_znotifier = NULL; + zcopy = !!znotifier; + if (rm->m_rs) { struct rds_sock *rs = rm->m_rs; - if (rm->data.op_mmp_znotifier) { - zcopy = true; - rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier); + if (znotifier) { + rds_rm_zerocopy_callback(rs, znotifier); rds_wake_sk_sleep(rs); - rm->data.op_mmp_znotifier = NULL; } sock_put(rds_rs_to_sk(rs)); rm->m_rs = NULL; + } else if (znotifier) { + /* + * Zerocopy can fail before the message is queued on the + * socket, so there is no rs to carry the notification. 
+ */ + mm_unaccount_pinned_pages(&znotifier->z_mmp); + kfree(rds_info_from_znotifier(znotifier)); } spin_unlock_irqrestore(&rm->m_rs_lock, flags); -- cgit v1.2.3 From 95084f1883a760e0d4290698346759d58e2b944a Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Thu, 30 Apr 2026 19:47:12 -0700 Subject: net: mana: Fix crash from unvalidated SHM offset read from BAR0 during FLR During Function Level Reset recovery, the MANA driver reads hardware BAR0 registers that may temporarily contain garbage values. The SHM (Shared Memory) offset read from GDMA_REG_SHM_OFFSET is used to compute gc->shm_base, which is later dereferenced via readl() in mana_smc_poll_register(). If the hardware returns an unaligned or out-of-range value, the driver must not blindly use it, as this would propagate the hardware error into a kernel crash. The following crash was observed on an arm64 Hyper-V guest running kernel 6.17.0-3013-azure during VF reset recovery triggered by HWC timeout. [13291.785274] Unable to handle kernel paging request at virtual address ffff8000a200001b [13291.785311] Mem abort info: [13291.785332] ESR = 0x0000000096000021 [13291.785343] EC = 0x25: DABT (current EL), IL = 32 bits [13291.785355] SET = 0, FnV = 0 [13291.785363] EA = 0, S1PTW = 0 [13291.785372] FSC = 0x21: alignment fault [13291.785382] Data abort info: [13291.785391] ISV = 0, ISS = 0x00000021, ISS2 = 0x00000000 [13291.785404] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [13291.785412] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [13291.785421] swapper pgtable: 4k pages, 48-bit VAs, pgdp=00000014df3a1000 [13291.785432] [ffff8000a200001b] pgd=1000000100438403, p4d=1000000100438403, pud=1000000100439403, pmd=0068000fc2000711 [13291.785703] Internal error: Oops: 0000000096000021 [#1] SMP [13291.830975] Modules linked in: tls qrtr mana_ib ib_uverbs ib_core xt_owner xt_tcpudp xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nft_compat nf_tables cfg80211 8021q garp mrp stp llc binfmt_misc joydev serio_raw 
nls_iso8859_1 hid_generic aes_ce_blk aes_ce_cipher polyval_ce ghash_ce sm4_ce_gcm sm4_ce_ccm sm4_ce sm4_ce_cipher hid_hyperv sm4 sm3_ce sha3_ce hv_netvsc hid vmgenid hyperv_keyboard hyperv_drm sch_fq_codel nvme_fabrics efi_pstore dm_multipath nfnetlink vsock_loopback vmw_vsock_virtio_transport_common hv_sock vmw_vsock_vmci_transport vmw_vmci vsock dmi_sysfs ip_tables x_tables autofs4 [13291.862630] CPU: 122 UID: 0 PID: 61796 Comm: kworker/122:2 Tainted: G W 6.17.0-3013-azure #13-Ubuntu VOLUNTARY [13291.869902] Tainted: [W]=WARN [13291.871901] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 01/08/2026 [13291.878086] Workqueue: events mana_serv_func [13291.880718] pstate: 62400005 (nZCv daif +PAN -UAO +TCO -DIT -SSBS BTYPE=--) [13291.884835] pc : mana_smc_poll_register+0x48/0xb0 [13291.887902] lr : mana_smc_setup_hwc+0x70/0x1c0 [13291.890493] sp : ffff8000ab79bbb0 [13291.892364] x29: ffff8000ab79bbb0 x28: ffff00410c8b5900 x27: ffff00410d630680 [13291.896252] x26: ffff004171f9fd80 x25: 000000016ed55000 x24: 000000017f37e000 [13291.899990] x23: 0000000000000000 x22: 000000016ed55000 x21: 0000000000000000 [13291.904497] x20: ffff8000a200001b x19: 0000000000004e20 x18: ffff8000a6183050 [13291.908308] x17: 0000000000000000 x16: 0000000000000000 x15: 000000000000000a [13291.912542] x14: 0000000000000004 x13: 0000000000000000 x12: 0000000000000000 [13291.916298] x11: 0000000000000000 x10: 0000000000000001 x9 : ffffc45006af1bd8 [13291.920945] x8 : ffff000151129000 x7 : 0000000000000000 x6 : 0000000000000000 [13291.925293] x5 : 000000015f214000 x4 : 000000017217a000 x3 : 000000016ed50000 [13291.930436] x2 : 000000016ed55000 x1 : 0000000000000000 x0 : ffff8000a1ffffff [13291.934342] Call trace: [13291.935736] mana_smc_poll_register+0x48/0xb0 (P) [13291.938611] mana_smc_setup_hwc+0x70/0x1c0 [13291.941113] mana_hwc_create_channel+0x1a0/0x3a0 [13291.944283] mana_gd_setup+0x16c/0x398 [13291.946584] mana_gd_resume+0x24/0x70 
[13291.948917] mana_do_service+0x13c/0x1d0 [13291.951583] mana_serv_func+0x34/0x68 [13291.953732] process_one_work+0x168/0x3d0 [13291.956745] worker_thread+0x2ac/0x480 [13291.959104] kthread+0xf8/0x110 [13291.961026] ret_from_fork+0x10/0x20 [13291.963560] Code: d2807d00 9417c551 71000673 54000220 (b9400281) [13291.967299] ---[ end trace 0000000000000000 ]--- Disassembly of mana_smc_poll_register() around the crash site: Disassembly of section .text: 00000000000047c8 : 47c8: d503201f nop 47cc: d503201f nop 47d0: d503233f paciasp 47d4: f800865e str x30, [x18], #8 47d8: a9bd7bfd stp x29, x30, [sp, #-48]! 47dc: 910003fd mov x29, sp 47e0: a90153f3 stp x19, x20, [sp, #16] 47e4: 91007014 add x20, x0, #0x1c 47e8: 5289c413 mov w19, #0x4e20 47ec: f90013f5 str x21, [sp, #32] 47f0: 12001c35 and w21, w1, #0xff 47f4: 14000008 b 4814 47f8: 36f801e1 tbz w1, #31, 4834 47fc: 52800042 mov w2, #0x2 4800: d280fa01 mov x1, #0x7d0 4804: d2807d00 mov x0, #0x3e8 4808: 94000000 bl 0 480c: 71000673 subs w19, w19, #0x1 4810: 54000200 b.eq 4850 4814: b9400281 ldr w1, [x20] <-- **** CRASHED HERE ***** 4818: d50331bf dmb oshld 481c: 2a0103e2 mov w2, w1 ... From the crash signature x20 = ffff8000a200001b, this address ends in 0x1b which is not 4-byte aligned, so the 'ldr w1, [x20]' instruction (readl) triggers the arm64 alignment fault (FSC = 0x21). The root cause is in mana_gd_init_vf_regs(), which computes: gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); The offset is used without any validation. The same problem exists in mana_gd_init_pf_regs() for sriov_base_off and sriov_shm_off. Fix this by validating all offsets before use: - VF: check shm_off is within BAR0, properly aligned to 4 bytes (readl requirement), and leaves room for the full 256-bit (32-byte) SMC aperture. - PF: check sriov_base_off is within BAR0, aligned to 8 bytes (readq requirement), and leaves room to safely read the sriov_shm_off register at sriov_base_off + GDMA_PF_REG_SHM_OFF. 
Then check sriov_shm_off leaves room for the full SMC aperture. All arithmetic uses subtraction rather than addition to avoid integer overflow on garbage values. Define SMC_APERTURE_SIZE (32 bytes, derived from the 256-bit aperture width) Return -EPROTO on invalid values. The existing recovery path in mana_serv_reset() already handles -EPROTO by falling through to PCI device rescan, giving the hardware another chance to present valid register values after reset. Fixes: 9bf66036d686 ("net: mana: Handle hardware recovery events when probing the device") Signed-off-by: Dipayaan Roy Link: https://patch.msgid.link/afQUMClyjmBVfD+u@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 40 ++++++++++++++++++++--- drivers/net/ethernet/microsoft/mana/shm_channel.c | 5 --- include/net/mana/shm_channel.h | 6 ++++ 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 098fbda0d128..d8e816882f02 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -43,8 +43,9 @@ static u64 mana_gd_r64(struct gdma_context *g, u64 offset) static int mana_gd_init_pf_regs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); - void __iomem *sriov_base_va; + u64 remaining_barsize; u64 sriov_base_off; + u64 sriov_shm_off; gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF; @@ -73,10 +74,28 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev) gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off; sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF); + if (sriov_base_off >= gc->bar0_size || + gc->bar0_size - sriov_base_off < + GDMA_PF_REG_SHM_OFF + sizeof(u64) || + !IS_ALIGNED(sriov_base_off, sizeof(u64))) { + dev_err(gc->dev, + "SRIOV base offset 0x%llx out of range or unaligned (BAR0 
size 0x%llx)\n", + sriov_base_off, (u64)gc->bar0_size); + return -EPROTO; + } - sriov_base_va = gc->bar0_va + sriov_base_off; - gc->shm_base = sriov_base_va + - mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF); + remaining_barsize = gc->bar0_size - sriov_base_off; + sriov_shm_off = mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF); + if (sriov_shm_off >= remaining_barsize || + remaining_barsize - sriov_shm_off < SMC_APERTURE_SIZE || + !IS_ALIGNED(sriov_shm_off, sizeof(u32))) { + dev_err(gc->dev, + "SRIOV SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n", + sriov_shm_off, (u64)gc->bar0_size); + return -EPROTO; + } + + gc->shm_base = gc->bar0_va + sriov_base_off + sriov_shm_off; return 0; } @@ -84,6 +103,7 @@ static int mana_gd_init_pf_regs(struct pci_dev *pdev) static int mana_gd_init_vf_regs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); + u64 shm_off; gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF; @@ -111,7 +131,17 @@ static int mana_gd_init_vf_regs(struct pci_dev *pdev) gc->db_page_base = gc->bar0_va + gc->db_page_off; gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off; - gc->shm_base = gc->bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); + shm_off = mana_gd_r64(gc, GDMA_REG_SHM_OFFSET); + if (shm_off >= gc->bar0_size || + gc->bar0_size - shm_off < SMC_APERTURE_SIZE || + !IS_ALIGNED(shm_off, sizeof(u32))) { + dev_err(gc->dev, + "SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n", + shm_off, (u64)gc->bar0_size); + return -EPROTO; + } + + gc->shm_base = gc->bar0_va + shm_off; return 0; } diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.c b/drivers/net/ethernet/microsoft/mana/shm_channel.c index 0f1679ebad96..d21b5db06e50 100644 --- a/drivers/net/ethernet/microsoft/mana/shm_channel.c +++ b/drivers/net/ethernet/microsoft/mana/shm_channel.c @@ -61,11 +61,6 @@ union smc_proto_hdr { }; }; /* HW DATA */ -#define SMC_APERTURE_BITS 256 -#define SMC_BASIC_UNIT (sizeof(u32)) 
-#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) -#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) - static int mana_smc_poll_register(void __iomem *base, bool reset) { void __iomem *ptr = base + SMC_LAST_DWORD * SMC_BASIC_UNIT; diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h index 5199b41497ff..dbabcfb95daf 100644 --- a/include/net/mana/shm_channel.h +++ b/include/net/mana/shm_channel.h @@ -4,6 +4,12 @@ #ifndef _SHM_CHANNEL_H #define _SHM_CHANNEL_H +#define SMC_APERTURE_BITS 256 +#define SMC_BASIC_UNIT (sizeof(u32)) +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) +#define SMC_APERTURE_SIZE (SMC_APERTURE_BITS / 8) + struct shm_channel { struct device *dev; void __iomem *base; -- cgit v1.2.3 From ac8eb3e18f41e2cc8492cc1d358bcb786c850270 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 5 May 2026 15:15:40 +0200 Subject: wifi: mac80211: use safe list iteration in radar detect work The call to ieee80211_dfs_cac_cancel can cause the iterated chanctx to be freed and removed from the list. Guard against this to avoid a slab-use-after-free error. 
Cc: stable@vger.kernel.org Fixes: bca8bc0399ac ("wifi: mac80211: handle ieee80211_radar_detected() for MLO") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20260505151539.236d63a1b736.I35dbb9e96a2d4a480be208770fdd99ba3b817b79@changeid Signed-off-by: Johannes Berg --- net/mac80211/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b093bc203c81..2529b01e2cd5 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3700,11 +3700,11 @@ void ieee80211_dfs_radar_detected_work(struct wiphy *wiphy, struct ieee80211_local *local = container_of(work, struct ieee80211_local, radar_detected_work); struct cfg80211_chan_def chandef; - struct ieee80211_chanctx *ctx; + struct ieee80211_chanctx *ctx, *tmp; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(ctx, &local->chanctx_list, list) { + list_for_each_entry_safe(ctx, tmp, &local->chanctx_list, list) { if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) continue; -- cgit v1.2.3 From 901a7d9e2f280a9e76e6c58406a519cb11ad5ff8 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Sun, 3 May 2026 21:25:16 +0200 Subject: ipv6: default IPV6_SIT to m This basically defaulted to m until recently, since IPV6 defaulted to m. Since IPV6 was changed to a boolean with a default of y, IPV6_SIT started defaulting to built-in as well. This results in a surprise sit0 device by default for defconfig (and defconfig-derived config) users at boot. For me, this broke an (admittedly non-robust) script. Preserve the behaviour of most configs by avoiding building this module, that's probably overall seldom used compared to IPv6 as a whole, into the kernel. 
Fixes: 309b905deee59 ("ipv6: convert CONFIG_IPV6 to built-in only and clean up Kconfigs") Signed-off-by: Alyssa Ross Reviewed-by: Fernando Fernandez Mancera Link: https://patch.msgid.link/20260503192515.290900-2-hi@alyssa.is Signed-off-by: Jakub Kicinski --- net/ipv6/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index c024aa77f25b..c3806c6ac96f 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -164,7 +164,7 @@ config IPV6_SIT select INET_TUNNEL select NET_IP_TUNNEL select IPV6_NDISC_NODETYPE - default y + default m help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -172,7 +172,7 @@ config IPV6_SIT into IPv4 packets. This is useful if you want to connect two IPv6 networks over an IPv4-only path. - Saying M here will produce a module called sit. If unsure, say Y. + Saying M here will produce a module called sit. If unsure, say M. config IPV6_SIT_6RD bool "IPv6: IPv6 Rapid Deployment (6RD)" -- cgit v1.2.3 From 5ad509c1fdad4bf0993b72d1b3d462f036d8a0d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 4 May 2026 06:43:13 +0000 Subject: ipv6: Fix null-ptr-deref in fib6_mtu(). syzbot reported null-ptr-deref in fib6_mtu(). [0] When res->f6i->fib6_pmtu is 0 in fib6_mtu(), it fetches MTU from __in6_dev_get(nh->fib_nh_dev)->cnf.mtu6. However, __in6_dev_get() could return NULL when the device is being unregistered. Let's return 0 MTU if __in6_dev_get() returns NULL in fib6_mtu(). 
[0]: Oops: general protection fault, probably for non-canonical address 0xdffffc00000000bc: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x00000000000005e0-0x00000000000005e7] CPU: 0 UID: 0 PID: 7890 Comm: syz.2.502 Tainted: G L syzkaller #0 PREEMPT(full) Tainted: [L]=SOFTLOCKUP Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:fib6_mtu net/ipv6/route.c:1648 [inline] RIP: 0010:rt6_insert_exception+0x9eb/0x10a0 net/ipv6/route.c:1753 Code: 3b 14 cf f7 45 85 f6 0f 85 1d 02 00 00 e8 7d 19 cf f7 48 8d bb e0 05 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 89 RSP: 0000:ffffc9000610f120 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffc9000c001000 RDX: 00000000000000bc RSI: ffffffff8a38bc83 RDI: 00000000000005e0 RBP: ffff888052f06000 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: ffff888042d16c00 R13: ffff888042d16cc8 R14: 0000000000000001 R15: 0000000000000500 FS: 0000000000000000(0000) GS:ffff88809717d000(0063) knlGS:00000000f540db40 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 00000000f73c6d50 CR3: 000000006eff0000 CR4: 0000000000352ef0 Call Trace: __ip6_rt_update_pmtu+0x555/0xd60 net/ipv6/route.c:2982 ip6_update_pmtu+0x34f/0x3b0 net/ipv6/route.c:3014 icmpv6_err+0x2a2/0x3f0 net/ipv6/icmp.c:82 icmpv6_notify+0x35e/0x820 net/ipv6/icmp.c:1087 icmpv6_rcv+0x10bf/0x1ae0 net/ipv6/icmp.c:1228 ip6_protocol_deliver_rcu+0xf97/0x1500 net/ipv6/ip6_input.c:478 ip6_input_finish+0x1e4/0x4a0 net/ipv6/ip6_input.c:529 NF_HOOK include/linux/netfilter.h:318 [inline] NF_HOOK include/linux/netfilter.h:312 [inline] ip6_input+0x105/0x2f0 net/ipv6/ip6_input.c:540 ip6_mc_input+0x513/0xf50 net/ipv6/ip6_input.c:630 dst_input include/net/dst.h:480 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:119 [inline] NF_HOOK include/linux/netfilter.h:318 [inline] NF_HOOK include/linux/netfilter.h:312 
[inline] ipv6_rcv+0x34c/0x3d0 net/ipv6/ip6_input.c:351 __netif_receive_skb_one_core+0x12d/0x1e0 net/core/dev.c:6202 __netif_receive_skb+0x1f/0x120 net/core/dev.c:6315 netif_receive_skb_internal net/core/dev.c:6401 [inline] netif_receive_skb+0x13b/0x7f0 net/core/dev.c:6460 tun_rx_batched.isra.0+0x3f6/0x750 drivers/net/tun.c:1511 tun_get_user+0x1e31/0x3c20 drivers/net/tun.c:1955 tun_chr_write_iter+0xdc/0x200 drivers/net/tun.c:2001 new_sync_write fs/read_write.c:595 [inline] vfs_write+0x6ac/0x1070 fs/read_write.c:688 ksys_write+0x12a/0x250 fs/read_write.c:740 do_syscall_32_irqs_on arch/x86/entry/syscall_32.c:83 [inline] do_int80_emulation+0x141/0x700 arch/x86/entry/syscall_32.c:172 asm_int80_emulation+0x1a/0x20 arch/x86/include/asm/idtentry.h:621 RIP: 0023:0xf715616b Code: 57 56 53 8b 44 24 14 f6 00 08 75 23 8b 44 24 18 8b 5c 24 1c 8b 4c 24 20 8b 54 24 24 8b 74 24 28 8b 7c 24 2c 8b 6c 24 30 cd 80 <5b> 5e 5f 5d c3 5b 5e 5f 5d e9 f7 a1 ff ff 66 90 66 90 66 90 90 53 RSP: 002b:00000000f540d44c EFLAGS: 00000246 ORIG_RAX: 0000000000000004 RAX: ffffffffffffffda RBX: 00000000000000c8 RCX: 0000000080000640 RDX: 000000000000007a RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000292 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Fixes: dcd1f572954f ("net/ipv6: Remove fib6_idev") Reported-by: syzbot+01f005f9c6387ca6f6dd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f83f22.170a0220.13cc2.0004.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260504064316.3820775-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0dc0316530ca..e3d355d1fbd6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1645,6 +1645,10 @@ static unsigned int fib6_mtu(const 
struct fib6_result *res) rcu_read_lock(); idev = __in6_dev_get(dev); + if (!idev) { + rcu_read_unlock(); + return 0; + } mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } -- cgit v1.2.3 From 07f44433355f70fa97d4c44b4c0d2e86adc082fb Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 4 May 2026 14:06:08 +0530 Subject: bnxt_en: Delay for 5 seconds after AER DPC for all chips The FW on all chips is requiring a 5-second delay after Downstream Port Containment (DPC) AER. The previously added 900 msec delay was not long enough in all cases because the chip's CRS (Configuration Request Retry Status) mechanism is not always reliable. Fixes: d5ab32e9b02d ("bnxt_en: Add delay to handle Downstream Port Containment (DPC) AER") Reviewed-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: Pavan Chebbi Link: https://patch.msgid.link/20260504083611.1383776-2-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8c55874f44ca..3db951d0c690 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -17360,9 +17360,14 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) netdev_info(bp->dev, "PCI Slot Reset\n"); - if (!(bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && - test_bit(BNXT_STATE_PCI_CHANNEL_IO_FROZEN, &bp->state)) - msleep(900); + if (test_bit(BNXT_STATE_PCI_CHANNEL_IO_FROZEN, &bp->state)) { + /* After DPC, the chip should return CRS when the vendor ID + * config register is read until it is ready. On all chips, + * this is not happening reliably so add a 5-second delay as a + * workaround. 
+ */ + msleep(5000); + } netdev_lock(netdev); -- cgit v1.2.3 From 54c28fab2fa5afd681c9c4b10f4f6da1efdd397a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 4 May 2026 14:06:09 +0530 Subject: bnxt_en: Set bp->max_tpa according to what the FW supports Fix the logic to set bp->max_tpa no higher than what the FW supports. On P5 chips, some older FW sets max_tpa very low so we override it to prevent performance regressions with the older FW. Fixes: 79632e9ba386 ("bnxt_en: Expand bnxt_tpa_info struct to support 57500 chips.") Reviewed-by: Kalesh AP Reviewed-by: Colin Winegarden Reviewed-by: Rukhsana Ansari Signed-off-by: Michael Chan Signed-off-by: Pavan Chebbi Link: https://patch.msgid.link/20260504083611.1383776-3-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3db951d0c690..008c34cff7b4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3825,7 +3825,10 @@ static int bnxt_alloc_tpa_info(struct bnxt *bp) if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { if (!bp->max_tpa_v2) return 0; - bp->max_tpa = max_t(u16, bp->max_tpa_v2, MAX_TPA_P5); + bp->max_tpa = min_t(u16, bp->max_tpa_v2, MAX_TPA_P5); + /* Older P5 FW sets max_tpa_v2 low by mistake except NPAR */ + if (bp->max_tpa <= 32 && BNXT_CHIP_P5(bp) && !BNXT_NPAR(bp)) + bp->max_tpa = MAX_TPA_P5; } for (i = 0; i < bp->rx_nr_rings; i++) { -- cgit v1.2.3 From 16517bc98a56004274472cc9949194cb4d2ad0b7 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 4 May 2026 14:06:10 +0530 Subject: bnxt_en: Check return value of bnxt_hwrm_vnic_cfg When the bnxt RDMA driver is loaded, it calls bnxt_register_dev(). As part of this, driver sends HWRM_VNIC_CFG firmware command to configure the VNIC to operate in dual VNIC mode. 
Currently the driver ignores the result of this firmware command. The RDMA driver must know the result since it affects its functioning. Check return value of call to bnxt_hwrm_vnic_cfg() in bnxt_register_dev() and return failure on error. Fixes: a588e4580a7e ("bnxt_en: Add interface to support RDMA driver.") Reviewed-by: Michael Chan Signed-off-by: Kalesh AP Signed-off-by: Pavan Chebbi Link: https://patch.msgid.link/20260504083611.1383776-4-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 052bf69cfa4c..5c751933da6a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -175,8 +175,14 @@ int bnxt_register_dev(struct bnxt_en_dev *edev, ulp->handle = handle; rcu_assign_pointer(ulp->ulp_ops, ulp_ops); - if (test_bit(BNXT_STATE_OPEN, &bp->state)) - bnxt_hwrm_vnic_cfg(bp, &bp->vnic_info[BNXT_VNIC_DEFAULT]); + if (test_bit(BNXT_STATE_OPEN, &bp->state)) { + rc = bnxt_hwrm_vnic_cfg(bp, &bp->vnic_info[BNXT_VNIC_DEFAULT]); + if (rc) { + netdev_err(dev, "Failed to configure dual VNIC mode\n"); + RCU_INIT_POINTER(ulp->ulp_ops, NULL); + goto exit; + } + } edev->ulp_tbl->msix_requested = bnxt_get_ulp_msix_num(bp); -- cgit v1.2.3 From bd279e104e5f5400307d56116a36756b35ab345a Mon Sep 17 00:00:00 2001 From: Pavan Chebbi Date: Mon, 4 May 2026 14:06:11 +0530 Subject: bnxt_en: Use absolute target ns from ptp_clock_request There is no need to calculate the target PHC cycles required to make phase adjustment on the PPS OUT signal. This is because the application supplies absolute n_sec value in the future and is already the actual desired target value. Remove the unnecessary code. 
Fixes: 9e518f25802c ("bnxt_en: 1PPS functions to configure TSIO pins") Reviewed-by: Kalesh AP Cc: Richard Cochran Signed-off-by: Pavan Chebbi Reviewed-by: Vadim Fedorenko Tested-by: Vadim Fedorenko Link: https://patch.msgid.link/20260504083611.1383776-5-pavan.chebbi@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 29 +++++---------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index 53f336db4fcc..5d41dc1bc782 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -419,31 +419,13 @@ void bnxt_ptp_reapply_pps(struct bnxt *bp) } } -static int bnxt_get_target_cycles(struct bnxt_ptp_cfg *ptp, u64 target_ns, - u64 *cycles_delta) -{ - u64 cycles_now; - u64 nsec_now, nsec_delta; - int rc; - - rc = bnxt_refclk_read(ptp->bp, NULL, &cycles_now); - if (rc) - return rc; - - nsec_now = bnxt_timecounter_cyc2time(ptp, cycles_now); - - nsec_delta = target_ns - nsec_now; - *cycles_delta = div64_u64(nsec_delta << ptp->cc.shift, ptp->cc.mult); - return 0; -} - static int bnxt_ptp_perout_cfg(struct bnxt_ptp_cfg *ptp, struct ptp_clock_request *rq) { struct hwrm_func_ptp_cfg_input *req; struct bnxt *bp = ptp->bp; struct timespec64 ts; - u64 target_ns, delta; + u64 target_ns; u16 enables; int rc; @@ -451,10 +433,6 @@ static int bnxt_ptp_perout_cfg(struct bnxt_ptp_cfg *ptp, ts.tv_nsec = rq->perout.start.nsec; target_ns = timespec64_to_ns(&ts); - rc = bnxt_get_target_cycles(ptp, target_ns, &delta); - if (rc) - return rc; - rc = hwrm_req_init(bp, req, HWRM_FUNC_PTP_CFG); if (rc) return rc; @@ -468,7 +446,10 @@ static int bnxt_ptp_perout_cfg(struct bnxt_ptp_cfg *ptp, req->ptp_freq_adj_dll_phase = 0; req->ptp_freq_adj_ext_period = cpu_to_le32(NSEC_PER_SEC); req->ptp_freq_adj_ext_up = 0; - req->ptp_freq_adj_ext_phase_lower = cpu_to_le32(delta); + 
req->ptp_freq_adj_ext_phase_lower = + cpu_to_le32(lower_32_bits(target_ns)); + req->ptp_freq_adj_ext_phase_upper = + cpu_to_le32(upper_32_bits(target_ns)); return hwrm_req_send(bp, req); } -- cgit v1.2.3 From f83e07b29246f468bc7c99f98ca1897843fa8167 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 May 2026 16:38:42 +0000 Subject: net/sched: sch_fq_codel: annotate data-races from fq_codel_dump_class_stats() fq_codel_dump_class_stats() acquires qdisc spinlock only when requested to follow flow->head chain. As we did in sch_cake recently, add the missing READ_ONCE()/WRITE_ONCE() annotations. Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260504163842.1162001-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_fq_codel.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 0664b2f2d6f2..24db54684e8a 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -117,7 +117,7 @@ static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow) { struct sk_buff *skb = flow->head; - flow->head = skb->next; + WRITE_ONCE(flow->head, skb->next); skb_mark_not_on_list(skb); return skb; } @@ -127,7 +127,7 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, struct sk_buff *skb) { if (flow->head == NULL) - flow->head = skb; + WRITE_ONCE(flow->head, skb); else flow->tail->next = skb; flow->tail = skb; @@ -173,8 +173,8 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, } while (++i < max_packets && len < threshold); /* Tell codel to increase its signal strength also */ - flow->cvars.count += i; - q->backlogs[idx] -= len; + WRITE_ONCE(flow->cvars.count, flow->cvars.count + i); + WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] - len); q->memory_usage -= mem; 
sch->qstats.drops += i; sch->qstats.backlog -= len; @@ -204,13 +204,13 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch, codel_set_enqueue_time(skb); flow = &q->flows[idx]; flow_queue_add(flow, skb); - q->backlogs[idx] += qdisc_pkt_len(skb); + WRITE_ONCE(q->backlogs[idx], q->backlogs[idx] + qdisc_pkt_len(skb)); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); q->new_flow_count++; - flow->deficit = q->quantum; + WRITE_ONCE(flow->deficit, q->quantum); } get_codel_cb(skb)->mem_usage = skb->truesize; q->memory_usage += get_codel_cb(skb)->mem_usage; @@ -263,7 +263,8 @@ static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) flow = container_of(vars, struct fq_codel_flow, cvars); if (flow->head) { skb = dequeue_head(flow); - q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + WRITE_ONCE(q->backlogs[flow - q->flows], + q->backlogs[flow - q->flows] - qdisc_pkt_len(skb)); q->memory_usage -= get_codel_cb(skb)->mem_usage; sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -296,7 +297,7 @@ begin: flow = list_first_entry(head, struct fq_codel_flow, flowchain); if (flow->deficit <= 0) { - flow->deficit += q->quantum; + WRITE_ONCE(flow->deficit, flow->deficit + q->quantum); list_move_tail(&flow->flowchain, &q->old_flows); goto begin; } @@ -314,7 +315,7 @@ begin: goto begin; } qdisc_bstats_update(sch, skb); - flow->deficit -= qdisc_pkt_len(skb); + WRITE_ONCE(flow->deficit, flow->deficit - qdisc_pkt_len(skb)); if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, @@ -328,7 +329,7 @@ begin: static void fq_codel_flow_purge(struct fq_codel_flow *flow) { rtnl_kfree_skbs(flow->head, flow->tail); - flow->head = NULL; + WRITE_ONCE(flow->head, NULL); } static void fq_codel_reset(struct Qdisc *sch) @@ -656,21 +657,21 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, memset(&xstats, 0, sizeof(xstats)); xstats.type = 
TCA_FQ_CODEL_XSTATS_CLASS; - xstats.class_stats.deficit = flow->deficit; + xstats.class_stats.deficit = READ_ONCE(flow->deficit); xstats.class_stats.ldelay = - codel_time_to_us(flow->cvars.ldelay); - xstats.class_stats.count = flow->cvars.count; - xstats.class_stats.lastcount = flow->cvars.lastcount; - xstats.class_stats.dropping = flow->cvars.dropping; - if (flow->cvars.dropping) { - codel_tdiff_t delta = flow->cvars.drop_next - + codel_time_to_us(READ_ONCE(flow->cvars.ldelay)); + xstats.class_stats.count = READ_ONCE(flow->cvars.count); + xstats.class_stats.lastcount = READ_ONCE(flow->cvars.lastcount); + xstats.class_stats.dropping = READ_ONCE(flow->cvars.dropping); + if (xstats.class_stats.dropping) { + codel_tdiff_t delta = READ_ONCE(flow->cvars.drop_next) - codel_get_time(); xstats.class_stats.drop_next = (delta >= 0) ? codel_time_to_us(delta) : -codel_time_to_us(-delta); } - if (flow->head) { + if (READ_ONCE(flow->head)) { sch_tree_lock(sch); skb = flow->head; while (skb) { @@ -679,7 +680,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, } sch_tree_unlock(sch); } - qs.backlog = q->backlogs[idx]; + qs.backlog = READ_ONCE(q->backlogs[idx]); qs.drops = 0; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) -- cgit v1.2.3 From 0e7c074cfcd9bd93765505f9eb8b42f03ed2a744 Mon Sep 17 00:00:00 2001 From: Pavitra Jha Date: Fri, 1 May 2026 07:07:12 -0400 Subject: net: wwan: t7xx: validate port_count against message length in t7xx_port_enum_msg_handler t7xx_port_enum_msg_handler() uses the modem-supplied port_count field as a loop bound over port_msg->data[] without checking that the message buffer contains sufficient data. A modem sending port_count=65535 in a 12-byte buffer triggers a slab-out-of-bounds read of up to 262140 bytes. Add a sizeof(*port_msg) check before accessing the port message header fields to guard against undersized messages. Add a struct_size() check after extracting port_count and before the loop. 
In t7xx_parse_host_rt_data(), guard the rt_feature header read with a remaining-buffer check before accessing data_len, validate feat_data_len against the actual remaining buffer to prevent OOB reads and signed integer overflow on offset. Pass msg_len from both call sites: skb->len at the DPMAIF path after skb_pull(), and the validated feat_data_len at the handshake path. Fixes: da45d2566a1d ("net: wwan: t7xx: Add control port") Cc: stable@vger.kernel.org Signed-off-by: Pavitra Jha Link: https://patch.msgid.link/20260501110713.145563-1-jhapavitra98@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/wwan/t7xx/t7xx_modem_ops.c | 20 +++++++++++++++++--- drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c | 18 ++++++++++++++++-- drivers/net/wwan/t7xx/t7xx_port_proxy.h | 2 +- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_modem_ops.c b/drivers/net/wwan/t7xx/t7xx_modem_ops.c index 7968e208dd37..adb29d30c63f 100644 --- a/drivers/net/wwan/t7xx/t7xx_modem_ops.c +++ b/drivers/net/wwan/t7xx/t7xx_modem_ops.c @@ -457,8 +457,20 @@ static int t7xx_parse_host_rt_data(struct t7xx_fsm_ctl *ctl, struct t7xx_sys_inf offset = sizeof(struct feature_query); for (i = 0; i < FEATURE_COUNT && offset < data_length; i++) { + size_t remaining = data_length - offset; + size_t feat_data_len, feat_total; + + if (remaining < sizeof(*rt_feature)) + break; + rt_feature = data + offset; - offset += sizeof(*rt_feature) + le32_to_cpu(rt_feature->data_len); + feat_data_len = le32_to_cpu(rt_feature->data_len); + + if (feat_data_len > remaining - sizeof(*rt_feature)) + break; + + feat_total = sizeof(*rt_feature) + feat_data_len; + offset += feat_total; ft_spt_cfg = FIELD_GET(FEATURE_MSK, core->feature_set[i]); if (ft_spt_cfg != MTK_FEATURE_MUST_BE_SUPPORTED) @@ -468,8 +480,10 @@ static int t7xx_parse_host_rt_data(struct t7xx_fsm_ctl *ctl, struct t7xx_sys_inf if (ft_spt_st != MTK_FEATURE_MUST_BE_SUPPORTED) return -EINVAL; - if (i == RT_ID_MD_PORT_ENUM || i == 
RT_ID_AP_PORT_ENUM) - t7xx_port_enum_msg_handler(ctl->md, rt_feature->data); + if (i == RT_ID_MD_PORT_ENUM || i == RT_ID_AP_PORT_ENUM) { + t7xx_port_enum_msg_handler(ctl->md, rt_feature->data, + feat_data_len); + } } return 0; diff --git a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c index ae632ef96698..f869e4ed9ee9 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c +++ b/drivers/net/wwan/t7xx/t7xx_port_ctrl_msg.c @@ -117,6 +117,7 @@ static int fsm_ee_message_handler(struct t7xx_port *port, struct t7xx_fsm_ctl *c * t7xx_port_enum_msg_handler() - Parse the port enumeration message to create/remove nodes. * @md: Modem context. * @msg: Message. + * @msg_len: Length of @msg in bytes. * * Used to control create/remove device node. * @@ -124,12 +125,18 @@ static int fsm_ee_message_handler(struct t7xx_port *port, struct t7xx_fsm_ctl *c * * 0 - Success. * * -EFAULT - Message check failure. */ -int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg) +int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len) { struct device *dev = &md->t7xx_dev->pdev->dev; unsigned int version, port_count, i; struct port_msg *port_msg = msg; + if (msg_len < sizeof(*port_msg)) { + dev_err(dev, "Port enum msg too short for header: need %zu, have %zu\n", + sizeof(*port_msg), msg_len); + return -EINVAL; + } + version = FIELD_GET(PORT_MSG_VERSION, le32_to_cpu(port_msg->info)); if (version != PORT_ENUM_VER || le32_to_cpu(port_msg->head_pattern) != PORT_ENUM_HEAD_PATTERN || @@ -141,6 +148,13 @@ int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg) } port_count = FIELD_GET(PORT_MSG_PRT_CNT, le32_to_cpu(port_msg->info)); + + if (msg_len < struct_size(port_msg, data, port_count)) { + dev_err(dev, "Port enum msg too short: need %zu, have %zu\n", + struct_size(port_msg, data, port_count), msg_len); + return -EINVAL; + } + for (i = 0; i < port_count; i++) { u32 port_info = le32_to_cpu(port_msg->data[i]); unsigned 
int ch_id; @@ -191,7 +205,7 @@ static int control_msg_handler(struct t7xx_port *port, struct sk_buff *skb) case CTL_ID_PORT_ENUM: skb_pull(skb, sizeof(*ctrl_msg_h)); - ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data); + ret = t7xx_port_enum_msg_handler(ctl->md, (struct port_msg *)skb->data, skb->len); if (!ret) ret = port_ctl_send_msg_to_md(port, CTL_ID_PORT_ENUM, 0); else diff --git a/drivers/net/wwan/t7xx/t7xx_port_proxy.h b/drivers/net/wwan/t7xx/t7xx_port_proxy.h index f0918b36e899..7c3190bf0fcf 100644 --- a/drivers/net/wwan/t7xx/t7xx_port_proxy.h +++ b/drivers/net/wwan/t7xx/t7xx_port_proxy.h @@ -103,7 +103,7 @@ void t7xx_port_proxy_reset(struct port_proxy *port_prox); void t7xx_port_proxy_uninit(struct port_proxy *port_prox); int t7xx_port_proxy_init(struct t7xx_modem *md); void t7xx_port_proxy_md_status_notify(struct port_proxy *port_prox, unsigned int state); -int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg); +int t7xx_port_enum_msg_handler(struct t7xx_modem *md, void *msg, size_t msg_len); int t7xx_port_proxy_chl_enable_disable(struct port_proxy *port_prox, unsigned int ch_id, bool en_flag); void t7xx_port_proxy_set_cfg(struct t7xx_modem *md, enum port_cfg_id cfg_id); -- cgit v1.2.3 From ae9582cd0b9ccc4a121af300df68fd27f72e9822 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 4 May 2026 21:10:58 +0300 Subject: net/mlx5e: psp: Fix invalid access on PSP dev registration fail priv->psp->psp is initialized with the PSP device as returned by psp_dev_create(). This could also return an error, in which case a future psp_dev_unregister() will result in unpleasantness. Avoid that by using a local variable and only saving the PSP device when registration succeeds. In case psp_dev_create() fails, priv->psp and steering structs are left in place, but they will be inert. 
The unchecked access of priv->psp in mlx5e_psp_offload_handle_rx_skb() won't happen because without a PSP device, there can be no SAs added and therefore no packets will be successfully decrypted and be handed off to the SW handler. Fixes: 89ee2d92f66c ("net/mlx5e: Support PSP offload functionality") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504181100.269334-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_accel/psp.c | 26 ++++++++++++++-------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c index 6a50b6dec0fa..1ff818fb48df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c @@ -1070,29 +1070,37 @@ static struct psp_dev_ops mlx5_psp_ops = { void mlx5e_psp_unregister(struct mlx5e_priv *priv) { - if (!priv->psp || !priv->psp->psp) + struct mlx5e_psp *psp = priv->psp; + + if (!psp || !psp->psp) return; - psp_dev_unregister(priv->psp->psp); + psp_dev_unregister(psp->psp); + psp->psp = NULL; } void mlx5e_psp_register(struct mlx5e_priv *priv) { + struct mlx5e_psp *psp = priv->psp; + struct psp_dev *psd; + /* FW Caps missing */ if (!priv->psp) return; - priv->psp->caps.assoc_drv_spc = sizeof(u32); - priv->psp->caps.versions = 1 << PSP_VERSION_HDR0_AES_GCM_128; + psp->caps.assoc_drv_spc = sizeof(u32); + psp->caps.versions = 1 << PSP_VERSION_HDR0_AES_GCM_128; if (MLX5_CAP_PSP(priv->mdev, psp_crypto_esp_aes_gcm_256_encrypt) && MLX5_CAP_PSP(priv->mdev, psp_crypto_esp_aes_gcm_256_decrypt)) - priv->psp->caps.versions |= 1 << PSP_VERSION_HDR0_AES_GCM_256; + psp->caps.versions |= 1 << PSP_VERSION_HDR0_AES_GCM_256; - priv->psp->psp = psp_dev_create(priv->netdev, &mlx5_psp_ops, - &priv->psp->caps, NULL); - if (IS_ERR(priv->psp->psp)) + psd = 
psp_dev_create(priv->netdev, &mlx5_psp_ops, &psp->caps, NULL); + if (IS_ERR(psd)) { mlx5_core_err(priv->mdev, "PSP failed to register due to %pe\n", - priv->psp->psp); + psd); + return; + } + psp->psp = psd; } int mlx5e_psp_init(struct mlx5e_priv *priv) -- cgit v1.2.3 From 50690733db59fbb3de9fa811b606af324eeb4e37 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 4 May 2026 21:10:59 +0300 Subject: net/mlx5e: psp: Expose only a fully initialized priv->psp Currently, during PSP init, priv->psp is initialized to an incompletely built psp struct. Additionally, on fs init failure priv->psp is reset to NULL. Change this so that only a fully initialized priv->psp is set, which makes the code easier to reason about in failure scenarios. Fixes: af2196f49480 ("net/mlx5e: Implement PSP operations .assoc_add and .assoc_del") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504181100.269334-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c index 1ff818fb48df..d9adb993e64d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/psp.c @@ -1139,22 +1139,18 @@ int mlx5e_psp_init(struct mlx5e_priv *priv) if (!psp) return -ENOMEM; - priv->psp = psp; fs = mlx5e_accel_psp_fs_init(priv); if (IS_ERR(fs)) { err = PTR_ERR(fs); - goto out_err; + kfree(psp); + return err; } psp->fs = fs; + priv->psp = psp; mlx5_core_dbg(priv->mdev, "PSP attached to netdevice\n"); return 0; - -out_err: - priv->psp = NULL; - kfree(psp); - return err; } void mlx5e_psp_cleanup(struct mlx5e_priv *priv) -- cgit v1.2.3 From c4a5c46199b5addf0157934da3aa89c33eb02a6d Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Mon, 4 May 2026 
21:11:00 +0300 Subject: net/mlx5e: psp: Hook PSP dev reg/unreg to profile enable/disable devlink reload while PSP connections are active does: mlx5_unload_one_devl_locked() -> mlx5_detach_device() -> _mlx5e_suspend() -> mlx5e_detach_netdev() -> profile->cleanup_rx -> profile->cleanup_tx -> mlx5e_destroy_mdev_resources() -> mlx5_core_dealloc_pd() fails: ... mlx5_core 0000:08:00.0: mlx5_cmd_out_err:821:(pid 19722): DEALLOC_PD(0x801) op_mod(0x0) failed, status bad resource state(0x9), syndrome (0xef0c8a), err(-22) ... The reason for failure is the existence of TX keys, which are removed by the PSP dev unregistration happening in: profile->cleanup() -> mlx5e_psp_unregister() -> mlx5e_psp_cleanup() -> psp_dev_unregister() ...but this isn't invoked in the devlink reload flow, only when changing the NIC profile (e.g. when transitioning to switchdev mode) or on dev teardown. Move PSP device registration into mlx5e_nic_enable(), and unregistration into the corresponding mlx5e_nic_disable(). These functions are called during netdev attach/detach after RX & TX are set up. This ensures that the keys will be gone by the time the PD is destroyed. 
Fixes: 89ee2d92f66c ("net/mlx5e: Support PSP offload functionality") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504181100.269334-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5a46870c4b74..8e9443caa933 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6023,7 +6023,6 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, if (take_rtnl) rtnl_lock(); - mlx5e_psp_register(priv); /* update XDP supported features */ mlx5e_set_xdp_feature(priv); @@ -6036,7 +6035,6 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, static void mlx5e_nic_cleanup(struct mlx5e_priv *priv) { mlx5e_health_destroy_reporters(priv); - mlx5e_psp_unregister(priv); mlx5e_ktls_cleanup(priv); mlx5e_psp_cleanup(priv); mlx5e_fs_cleanup(priv->fs); @@ -6160,6 +6158,7 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv) mlx5e_fs_init_l2_addr(priv->fs, netdev); mlx5e_ipsec_init(priv); + mlx5e_psp_register(priv); err = mlx5e_macsec_init(priv); if (err) @@ -6230,6 +6229,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) mlx5_lag_remove_netdev(mdev, priv->netdev); mlx5_vxlan_reset_to_default(mdev->vxlan); mlx5e_macsec_cleanup(priv); + mlx5e_psp_unregister(priv); mlx5e_ipsec_cleanup(priv); } -- cgit v1.2.3 From 3abcedfdfd3125431ed404fa75724118beac630b Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:03 +0300 Subject: net/mlx5: SD: Serialize init/cleanup mlx5_sd_init() / mlx5_sd_cleanup() may run from multiple PFs in the same Socket-Direct group. This can cause the SD bring-up/tear-down sequence to be executed more than once or interleaved across PFs. 
Protect SD init/cleanup with mlx5_devcom_comp_lock() and track the SD group state on the primary device. Skip init if the primary is already UP, and skip cleanup unless the primary is UP. The state check on cleanup is needed because sd_register() drops the devcom comp lock between marking the comp ready and assigning primary_dev on each peer. A concurrent cleanup that acquires the lock in this window would observe devcom_is_ready==true while primary_dev is still NULL (causing mlx5_sd_get_primary() to return NULL) or while the FW alias setup performed by mlx5_sd_init()'s body has not yet run (causing sd_cmd_unset_primary() to dereference a NULL tx_ft). Gate the cleanup body on primary_sd->state == MLX5_SD_STATE_UP, which is set only at the very end of mlx5_sd_init() under the same comp lock - so observing UP guarantees primary_dev, secondaries[], tx_ft, and dfs are all populated. Also bail explicitly if mlx5_sd_get_primary() returns NULL, in case state is checked on a peer whose primary_dev hasn't been assigned yet. In addition, move mlx5_devcom_comp_set_ready(false) from sd_unregister() into the cleanup's locked section, including the !primary and state != UP early-exit paths, so the device cannot unregister and free its struct mlx5_sd while devcom is still marked ready. A concurrent init acquiring the devcom lock will now observe devcom is no longer ready and bail out immediately. 
Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 56 +++++++++++++++++++++--- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 762c783156b4..ec42685bdece 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -18,6 +18,7 @@ struct mlx5_sd { u8 host_buses; struct mlx5_devcom_comp_dev *devcom; struct dentry *dfs; + u8 state; bool primary; union { struct { /* primary */ @@ -31,6 +32,11 @@ struct mlx5_sd { }; }; +enum mlx5_sd_state { + MLX5_SD_STATE_DOWN = 0, + MLX5_SD_STATE_UP, +}; + static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); @@ -270,9 +276,6 @@ static void sd_unregister(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); - mlx5_devcom_comp_lock(sd->devcom); - mlx5_devcom_comp_set_ready(sd->devcom, false); - mlx5_devcom_comp_unlock(sd->devcom); mlx5_devcom_unregister_component(sd->devcom); } @@ -426,6 +429,7 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) struct mlx5_core_dev *primary, *pos, *to; struct mlx5_sd *sd = mlx5_get_sd(dev); u8 alias_key[ACCESS_KEY_LEN]; + struct mlx5_sd *primary_sd; int err, i; err = sd_init(dev); @@ -440,10 +444,17 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) if (err) goto err_sd_cleanup; + mlx5_devcom_comp_lock(sd->devcom); if (!mlx5_devcom_comp_is_ready(sd->devcom)) - return 0; + goto out; primary = mlx5_sd_get_primary(dev); + if (!primary) + goto out; + + primary_sd = mlx5_get_sd(primary); + if (primary_sd->state != MLX5_SD_STATE_DOWN) + goto out; for (i = 0; i < ACCESS_KEY_LEN; i++) alias_key[i] = get_random_u8(); @@ -472,6 +483,9 @@ int 
mlx5_sd_init(struct mlx5_core_dev *dev) sd->group_id, mlx5_devcom_comp_get_size(sd->devcom)); sd_print_group(primary); + primary_sd->state = MLX5_SD_STATE_UP; +out: + mlx5_devcom_comp_unlock(sd->devcom); return 0; err_unset_secondaries: @@ -481,6 +495,15 @@ err_unset_secondaries: sd_cmd_unset_primary(primary); debugfs_remove_recursive(sd->dfs); err_sd_unregister: + mlx5_sd_for_each_secondary(i, primary, pos) { + struct mlx5_sd *peer_sd = mlx5_get_sd(pos); + + primary_sd->secondaries[i - 1] = NULL; + peer_sd->primary_dev = NULL; + } + primary_sd->primary = false; + mlx5_devcom_comp_set_ready(sd->devcom, false); + mlx5_devcom_comp_unlock(sd->devcom); sd_unregister(dev); err_sd_cleanup: sd_cleanup(dev); @@ -491,22 +514,43 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev) { struct mlx5_sd *sd = mlx5_get_sd(dev); struct mlx5_core_dev *primary, *pos; + struct mlx5_sd *primary_sd; int i; if (!sd) return; + mlx5_devcom_comp_lock(sd->devcom); if (!mlx5_devcom_comp_is_ready(sd->devcom)) - goto out; + goto out_unlock; primary = mlx5_sd_get_primary(dev); + if (!primary) + goto out_ready_false; + + primary_sd = mlx5_get_sd(primary); + if (primary_sd->state != MLX5_SD_STATE_UP) + goto out_clear_peers; + mlx5_sd_for_each_secondary(i, primary, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); debugfs_remove_recursive(sd->dfs); sd_info(primary, "group id %#x, uncombined\n", sd->group_id); -out: + primary_sd->state = MLX5_SD_STATE_DOWN; +out_clear_peers: + mlx5_sd_for_each_secondary(i, primary, pos) { + struct mlx5_sd *peer_sd = mlx5_get_sd(pos); + + primary_sd->secondaries[i - 1] = NULL; + peer_sd->primary_dev = NULL; + } + primary_sd->primary = false; +out_ready_false: + mlx5_devcom_comp_set_ready(sd->devcom, false); +out_unlock: + mlx5_devcom_comp_unlock(sd->devcom); sd_unregister(dev); sd_cleanup(dev); } -- cgit v1.2.3 From 05217e4ffbb229e7218cf318e0033780abadb624 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:04 +0300 Subject: net/mlx5: 
SD, Keep multi-pf debugfs entries on primary mlx5_sd_init() creates the "multi-pf" debugfs directory under the primary device debugfs root, but stored the dentry in the calling device's sd struct. When sd_cleanup() runs on a different PF, this leads to using the wrong sd->dfs for removing entries, which results in a memory leak and an error when re-creating the SD.[1] Fix it by explicitly storing the debugfs dentry in the primary device sd struct and using it for all per-group files. [1] debugfs: 'multi-pf' already exists in '0000:08:00.1' Fixes: 4375130bf527 ("net/mlx5: SD, Add debugfs") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index ec42685bdece..89b7e4d67303 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -463,9 +463,13 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) if (err) goto err_sd_unregister; - sd->dfs = debugfs_create_dir("multi-pf", mlx5_debugfs_get_dev_root(primary)); - debugfs_create_x32("group_id", 0400, sd->dfs, &sd->group_id); - debugfs_create_file("primary", 0400, sd->dfs, primary, &dev_fops); + primary_sd->dfs = + debugfs_create_dir("multi-pf", + mlx5_debugfs_get_dev_root(primary)); + debugfs_create_x32("group_id", 0400, primary_sd->dfs, + &primary_sd->group_id); + debugfs_create_file("primary", 0400, primary_sd->dfs, primary, + &dev_fops); mlx5_sd_for_each_secondary(i, primary, pos) { char name[32]; @@ -475,7 +479,8 @@ int mlx5_sd_init(struct mlx5_core_dev *dev) if (err) goto err_unset_secondaries; snprintf(name, sizeof(name), "secondary_%d", i - 1); - debugfs_create_file(name, 
0400, primary_sd->dfs, pos, + &dev_fops); } @@ -493,7 +498,8 @@ err_unset_secondaries: mlx5_sd_for_each_secondary_to(i, primary, to, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); - debugfs_remove_recursive(sd->dfs); + debugfs_remove_recursive(primary_sd->dfs); + primary_sd->dfs = NULL; err_sd_unregister: mlx5_sd_for_each_secondary(i, primary, pos) { struct mlx5_sd *peer_sd = mlx5_get_sd(pos); @@ -535,7 +541,8 @@ void mlx5_sd_cleanup(struct mlx5_core_dev *dev) mlx5_sd_for_each_secondary(i, primary, pos) sd_cmd_unset_secondary(pos); sd_cmd_unset_primary(primary); - debugfs_remove_recursive(sd->dfs); + debugfs_remove_recursive(primary_sd->dfs); + primary_sd->dfs = NULL; sd_info(primary, "group id %#x, uncombined\n", sd->group_id); primary_sd->state = MLX5_SD_STATE_DOWN; -- cgit v1.2.3 From 3564222cfdde83a2d760b80192155a3ada1c9bdd Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:05 +0300 Subject: net/mlx5e: SD, Fix missing cleanup on probe error When _mlx5e_probe() fails, the preceding successful mlx5_sd_init() is not undone. Auxiliary bus probe failure skips binding, so mlx5e_remove() is never called for that adev and the matching mlx5_sd_cleanup() never runs - leaking the per-dev SD struct. Call mlx5_sd_cleanup() on the probe error path to balance mlx5_sd_init(). A similar gap exists on the resume path: mlx5_sd_init() and mlx5_sd_cleanup() are currently bundled with both probe/remove and suspend/resume, even though only the FW alias state actually needs to follow the suspend/resume lifecycle - the sd struct allocation and devcom membership are software state that should track the full bound lifetime. As a result, a failed resume can leave a still-bound device with sd == NULL, which mlx5_sd_get_adev() can't distinguish from a non-SD device. Fixing this requires sd_suspend/resume APIs which will only destroy FW resources and is left for a follow-up series. 
Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 8e9443caa933..62b70334a13d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6775,8 +6775,8 @@ static int mlx5e_resume(struct auxiliary_device *adev) actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx); if (actual_adev) - return _mlx5e_resume(actual_adev); - return 0; + err = _mlx5e_resume(actual_adev); + return err; } static int _mlx5e_suspend(struct auxiliary_device *adev, bool pre_netdev_reg) @@ -6912,9 +6912,16 @@ static int mlx5e_probe(struct auxiliary_device *adev, return err; actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx); - if (actual_adev) - return _mlx5e_probe(actual_adev); + if (actual_adev) { + err = _mlx5e_probe(actual_adev); + if (err) + goto sd_cleanup; + } return 0; + +sd_cleanup: + mlx5_sd_cleanup(mdev); + return err; } static void _mlx5e_remove(struct auxiliary_device *adev) -- cgit v1.2.3 From d466ddda5500b6b8ae060909d2317811f2c32a6a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 4 May 2026 21:02:06 +0300 Subject: net/mlx5e: SD, Fix race condition in secondary device probe/remove When utilizing Socket-Direct single netdev functionality the driver resolves the actual auxiliary device using mlx5_sd_get_adev(). 
However, the current implementation returns the primary ETH auxiliary device without holding the device lock, leading to a potential race condition where the ETH device could be unbound or removed concurrently during probe, suspend, resume, or remove operations.[1] Fix this by introducing mlx5_sd_put_adev() and updating mlx5_sd_get_adev() so that secondary devices would get a ref and acquire the device lock of the returned auxiliary device. After the lock is acquired, a second devcom check is needed[2]. In addition, update the callers to pair the get operation with the new put operation, ensuring the lock is held while the auxiliary device is being operated on and released afterwards. The "primary" designation is determined once in sd_register(). It's set before devcom is marked ready, and it never changes after that. In addition, the primary path never locks a secondary: when the primary device invokes mlx5_sd_get_adev(), it sees dev == primary and returns; no additional lock is taken. Therefore lock ordering is always: secondary_lock -> primary_lock. The reverse never happens, so ABBA deadlock is impossible. [1] for example: BUG: kernel NULL pointer dereference, address: 0000000000000370 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP CPU: 4 UID: 0 PID: 3945 Comm: bash Not tainted 6.19.0-rc3+ #1 NONE Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5e_dcbnl_dscp_app+0x23/0x100 [mlx5_core] Call Trace: mlx5e_remove+0x82/0x12a [mlx5_core] device_release_driver_internal+0x194/0x1f0 bus_remove_device+0xc6/0x140 device_del+0x159/0x3c0 ? 
devl_param_driverinit_value_get+0x29/0x80 mlx5_rescan_drivers_locked+0x92/0x160 [mlx5_core] mlx5_unregister_device+0x34/0x50 [mlx5_core] mlx5_uninit_one+0x43/0xb0 [mlx5_core] remove_one+0x4e/0xc0 [mlx5_core] pci_device_remove+0x39/0xa0 device_release_driver_internal+0x194/0x1f0 unbind_store+0x99/0xa0 kernfs_fop_write_iter+0x12e/0x1e0 vfs_write+0x215/0x3d0 ksys_write+0x5f/0xd0 do_syscall_64+0x55/0xe90 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] CPU0 (primary) CPU1 (secondary) ========================================================================== mlx5e_remove() (device_lock held) mlx5e_remove() (2nd device_lock held) mlx5_sd_get_adev() mlx5_devcom_comp_is_ready() => true device_lock(primary) mlx5_sd_get_adev() ==> ret adev _mlx5e_remove() mlx5_sd_cleanup() // mlx5e_remove finished // releasing device_lock //need another check here... mlx5_devcom_comp_is_ready() => false Fixes: 381978d28317 ("net/mlx5e: Create single netdev per SD group") Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260504180206.268568-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 ++++++- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 39 +++++++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h | 2 ++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 62b70334a13d..8f2b3abe0092 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6774,8 +6774,10 @@ static int mlx5e_resume(struct auxiliary_device *adev) return err; actual_adev = mlx5_sd_get_adev(mdev, adev, edev->idx); - if (actual_adev) + if (actual_adev) { err = _mlx5e_resume(actual_adev); + mlx5_sd_put_adev(actual_adev, adev); + } return err; } @@ -6815,6 +6817,8 @@ static int mlx5e_suspend(struct auxiliary_device *adev, pm_message_t 
state) err = _mlx5e_suspend(actual_adev, false); mlx5_sd_cleanup(mdev); + if (actual_adev) + mlx5_sd_put_adev(actual_adev, adev); return err; } @@ -6916,11 +6920,14 @@ static int mlx5e_probe(struct auxiliary_device *adev, err = _mlx5e_probe(actual_adev); if (err) goto sd_cleanup; + mlx5_sd_put_adev(actual_adev, adev); } return 0; sd_cleanup: mlx5_sd_cleanup(mdev); + if (actual_adev) + mlx5_sd_put_adev(actual_adev, adev); return err; } @@ -6973,6 +6980,8 @@ static void mlx5e_remove(struct auxiliary_device *adev) _mlx5e_remove(actual_adev); mlx5_sd_cleanup(mdev); + if (actual_adev) + mlx5_sd_put_adev(actual_adev, adev); } static const struct auxiliary_device_id mlx5e_id_table[] = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 89b7e4d67303..6e199161b008 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -562,22 +562,55 @@ out_unlock: sd_cleanup(dev); } +/* Lock order: + * primary: actual_adev_lock -> SD devcom comp lock + * secondary: SD devcom comp lock -> (drop) -> actual_adev_lock + * The two locks are never held together, so no ABBA. 
+ */ struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev, struct auxiliary_device *adev, int idx) { struct mlx5_sd *sd = mlx5_get_sd(dev); struct mlx5_core_dev *primary; + struct mlx5_adev *primary_adev; if (!sd) return adev; - if (!mlx5_devcom_comp_is_ready(sd->devcom)) + mlx5_devcom_comp_lock(sd->devcom); + if (!mlx5_devcom_comp_is_ready(sd->devcom)) { + mlx5_devcom_comp_unlock(sd->devcom); return NULL; + } primary = mlx5_sd_get_primary(dev); - if (dev == primary) + if (!primary || dev == primary) { + mlx5_devcom_comp_unlock(sd->devcom); return adev; + } + + primary_adev = primary->priv.adev[idx]; + get_device(&primary_adev->adev.dev); + mlx5_devcom_comp_unlock(sd->devcom); - return &primary->priv.adev[idx]->adev; + device_lock(&primary_adev->adev.dev); + /* Primary may have completed remove between dropping devcom and + * acquiring device_lock; recheck. + */ + if (!mlx5_devcom_comp_is_ready(sd->devcom)) { + device_unlock(&primary_adev->adev.dev); + put_device(&primary_adev->adev.dev); + return NULL; + } + return &primary_adev->adev; +} + +void mlx5_sd_put_adev(struct auxiliary_device *actual_adev, + struct auxiliary_device *adev) +{ + if (actual_adev != adev) { + device_unlock(&actual_adev->dev); + put_device(&actual_adev->dev); + } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h index 137efaf9aabc..9bfd5b9756b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h @@ -15,6 +15,8 @@ struct mlx5_core_dev *mlx5_sd_ch_ix_get_dev(struct mlx5_core_dev *primary, int c struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev, struct auxiliary_device *adev, int idx); +void mlx5_sd_put_adev(struct auxiliary_device *actual_adev, + struct auxiliary_device *adev); int mlx5_sd_init(struct mlx5_core_dev *dev); void mlx5_sd_cleanup(struct mlx5_core_dev *dev); -- cgit v1.2.3 From d73a9a63f9f7f7c17637731fd28daf3665992d1e Mon 
Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:15 +0300 Subject: xsk: reject sw-csum UMEM binding to IFF_TX_SKB_NO_LINEAR devices skb_checksum_help() is a common helper that writes the folded 16-bit checksum back via skb->data + csum_start + csum_offset, i.e. it relies on the skb's linear head and fails (with WARN_ONCE and -EINVAL) when skb_headlen() is 0. AF_XDP generic xmit takes two very different paths depending on the netdev. Drivers that advertise IFF_TX_SKB_NO_LINEAR (e.g. virtio_net) skip the "copy payload into a linear head" step on purpose as a performance optimisation: xsk_build_skb_zerocopy() only attaches UMEM pages as frags and never calls skb_put(), so skb_headlen() stays 0 for the whole skb. For these skbs there is simply no linear area for skb_checksum_help() to write the csum into - the sw-csum fallback is structurally inapplicable. The patch tries to catch this and reject the combination with error at setup time. Rejecting at bind() converts this silent per-packet failure into a synchronous, actionable -EOPNOTSUPP at setup time. HW csum and launch_time metadata on IFF_TX_SKB_NO_LINEAR drivers are unaffected because they do not call skb_checksum_help(). Without the patch, every descriptor carrying 'XDP_TX_METADATA | XDP_TXMD_FLAGS_CHECKSUM' produces: 1) a WARN_ONCE "offset (N) >= skb_headlen() (0)" from skb_checksum_help(), 2) sendmsg() returning -EINVAL without consuming the descriptor (invalid_descs is not incremented), 3) a wedged TX ring: __xsk_generic_xmit() does not advance the consumer on non-EOVERFLOW errors, so the next sendmsg() re-reads the same descriptor and re-hits the same WARN until the socket is closed. 
Closes: https://lore.kernel.org/all/20260419045822.843BFC2BCAF@smtp.kernel.org/#t Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Fixes: 30c3055f9c0d ("xsk: wrap generic metadata handling onto separate function") Link: https://patch.msgid.link/20260502200722.53960-2-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk_buff_pool.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index cd7bc50872f6..d981cfdd8535 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -175,6 +175,9 @@ int xp_assign_dev(struct xsk_buff_pool *pool, if (force_zc && force_copy) return -EINVAL; + if (pool->tx_sw_csum && (netdev->priv_flags & IFF_TX_SKB_NO_LINEAR)) + return -EOPNOTSUPP; + if (xsk_get_pool_from_qid(netdev, queue_id)) return -EBUSY; -- cgit v1.2.3 From 0bb7a9caf5c1d6e25ba376ea6b39261ad28550f4 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:16 +0300 Subject: xsk: free the skb when hitting the upper bound MAX_SKB_FRAGS Fix it by explicitly adding kfree_skb() before returning back to its caller. How to reproduce it in virtio_net: 1. the current skb is the first one (which means xs->skb is NULL) and hit the limit MAX_SKB_FRAGS. 2. xsk_build_skb_zerocopy() returns -EOVERFLOW. 3. the caller xsk_build_skb() clears skb by using 'skb = NULL;'. This is why bug can be triggered. 4. there is no chance to free this skb anymore. Note that if in this case the xs->skb is not NULL, xsk_build_skb() will call xsk_drop_skb(xs->skb) to do the right thing. 
Fixes: cf24f5a5feea ("xsk: add support for AF_XDP multi-buffer on Tx path") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-3-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 887abed25466..d706b1e0bf60 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -856,8 +856,11 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { - if (unlikely(i >= MAX_SKB_FRAGS)) + if (unlikely(i >= MAX_SKB_FRAGS)) { + if (!xs->skb) + kfree_skb(skb); return ERR_PTR(-EOVERFLOW); + } page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); -- cgit v1.2.3 From 8cd3c1c6e7d9a1f0954159ec5f2fdaa7f6a48bd8 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:17 +0300 Subject: xsk: handle NULL dereference of the skb without frags issue When a first descriptor (xs->skb == NULL) triggers -EOVERFLOW in xsk_build_skb_zerocopy() (e.g., MAX_SKB_FRAGS exceeded), the free_err -EOVERFLOW handler unconditionally dereferences xs->skb via xsk_inc_num_desc(xs->skb) and xsk_drop_skb(xs->skb), causing a NULL pointer dereference. Fix this by guarding the existing xsk_inc_num_desc()/xsk_drop_skb() calls with an xs->skb check (for the continuation case), and add an else branch for the first-descriptor case that manually cancels the one reserved CQ slot and increments invalid_descs by one to account for the single invalid descriptor. 
Fixes: cf24f5a5feea ("xsk: add support for AF_XDP multi-buffer on Tx path") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-4-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d706b1e0bf60..06ee260f3afc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -976,9 +976,14 @@ free_err: kfree_skb(skb); if (err == -EOVERFLOW) { - /* Drop the packet */ - xsk_inc_num_desc(xs->skb); - xsk_drop_skb(xs->skb); + if (xs->skb) { + /* Drop the packet */ + xsk_inc_num_desc(xs->skb); + xsk_drop_skb(xs->skb); + } else { + xsk_cq_cancel_locked(xs->pool, 1); + xs->tx->invalid_descs++; + } xskq_cons_release(xs->tx); } else { /* Let application retry */ -- cgit v1.2.3 From 0f3776583d282550dbafe6082a914efcf9094d59 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:18 +0300 Subject: xsk: fix use-after-free of xs->skb in xsk_build_skb() free_err path When xsk_build_skb() processes multi-buffer packets in copy mode, the first descriptor stores data into the skb linear area without adding any frags, so nr_frags stays at 0. The caller then sets xs->skb = skb to accumulate subsequent descriptors. If a continuation descriptor fails (e.g. alloc_page returns NULL with -EAGAIN), we jump to free_err where the condition: if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); evaluates to true because nr_frags is still 0 (the first descriptor used the linear area, not frags). This frees the skb while xs->skb still points to it, creating a dangling pointer. On the next transmit attempt or socket close, xs->skb is dereferenced, causing a use-after-free or double-free. 
Fix by using a !xs->skb check to handle the first-frag situation, ensuring we only free skbs that were freshly allocated in this call (xs->skb is NULL) and never free an in-progress multi-buffer skb that the caller still references. Closes: https://lore.kernel.org/all/20260415082654.21026-4-kerneljasonxing@gmail.com/ Fixes: 6b9c129c2f93 ("xsk: remove @first_frag from xsk_build_skb()") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-5-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 06ee260f3afc..55378c3855d5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -972,7 +972,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, return skb; free_err: - if (skb && !skb_shinfo(skb)->nr_frags) + if (skb && !xs->skb) kfree_skb(skb); if (err == -EOVERFLOW) { -- cgit v1.2.3 From 3dec153ae484e3b2ddac841156e197ba54c8df94 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:19 +0300 Subject: xsk: prevent CQ desync when freeing half-built skbs in xsk_build_skb() Once xsk_skb_init_misc() has been called on an skb, its destructor is set to xsk_destruct_skb(), which submits the descriptor address(es) to the completion queue and advances the CQ producer. If such an skb is subsequently freed via kfree_skb() along an error path - before the skb has ever been handed to the driver - the destructor still runs and submits a bogus, half-initialized address to the CQ. Postpone the init phase until the allocation of the first frag has completed successfully. Before this init, the skb can be safely freed by kfree_skb(). 
Closes: https://lore.kernel.org/all/20260419045822.843BFC2BCAF@smtp.kernel.org/ Fixes: c30d084960cf ("xsk: avoid overwriting skb fields for multi-buffer traffic") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-6-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 55378c3855d5..af3c5752bb63 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -819,8 +819,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); - - xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); if (unlikely(err)) @@ -917,7 +915,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (unlikely(err)) goto free_err; - xsk_skb_init_misc(skb, xs, desc->addr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, xs->pool, hr); @@ -967,6 +964,8 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, } } + if (!xs->skb) + xsk_skb_init_misc(skb, xs, desc->addr); xsk_inc_num_desc(skb); return skb; -- cgit v1.2.3 From 8c2cff50afdd2b53c7cc2ca2297301c0ffd3e802 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:20 +0300 Subject: xsk: avoid skb leak in XDP_TX_METADATA case Fix it by explicitly adding kfree_skb() before returning to its caller. How to reproduce it in virtio_net: 1. the current skb is the first one (which means no frag and xs->skb is NULL) and users enable the metadata feature. 2. xsk_skb_metadata() returns an error code. 3. the caller xsk_build_skb() clears skb by using 'skb = NULL;'. 4. there is no chance to free this skb anymore. 
Closes: https://lore.kernel.org/all/20260415085204.3F87AC19424@smtp.kernel.org/ Fixes: 30c3055f9c0d ("xsk: wrap generic metadata handling onto separate function") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-7-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index af3c5752bb63..770ba4695a9d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -821,8 +821,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, skb_reserve(skb, hr); if (desc->options & XDP_TX_METADATA) { err = xsk_skb_metadata(skb, buffer, desc, pool, hr); - if (unlikely(err)) + if (unlikely(err)) { + kfree_skb(skb); return ERR_PTR(err); + } } } else { struct xsk_addrs *xsk_addr; -- cgit v1.2.3 From e0f229025a8e774a695017a376c4a01279c0e66e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 2 May 2026 23:07:21 +0300 Subject: xsk: fix xsk_addrs slab leak on multi-buffer error path When xsk_build_skb() / xsk_build_skb_zerocopy() sees the first continuation descriptor, it promotes destructor_arg from an inlined address to a freshly allocated xsk_addrs (num_descs = 1). The counter is bumped to >= 2 only at the very end of a successful build (by calling xsk_inc_num_desc()). If the build fails in between (e.g. alloc_page() returns NULL with -EAGAIN, or the MAX_SKB_FRAGS overflow hits), we jump to free_err, skip calling xsk_inc_num_desc() to increment num_descs and leave the half-built skb attached to xs->skb for the app to retry. 
The skb now has 1) destructor_arg = a real xsk_addrs pointer, 2) num_descs = 1 If the app never retries and just close()s the socket, xsk_release() calls xsk_drop_skb() -> xsk_consume_skb(), which decides whether to free xsk_addrs by testing num_descs > 1: if (unlikely(num_descs > 1)) kmem_cache_free(xsk_tx_generic_cache, destructor_arg); Because num_descs is exactly 1 the branch is skipped and the xsk_addrs object is leaked to the xsk_tx_generic_cache slab. Fix it by directly testing whether destructor_arg still holds the inline addr. Otherwise it has been repurposed to store memory newly allocated from xsk_tx_generic_cache - regardless of whether num_descs was incremented - which we need to handle. Closes: https://lore.kernel.org/all/20260419045824.D9E5EC2BCAF@smtp.kernel.org/ Fixes: 0ebc27a4c67d ("xsk: avoid data corruption on cq descriptor number") Acked-by: Stanislav Fomichev Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-8-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 770ba4695a9d..079abd4bcb69 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -685,7 +685,7 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, spin_lock_irqsave(&pool->cq_prod_lock, flags); idx = xskq_get_prod(pool->cq); - if (unlikely(num_descs > 1)) { + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; for (i = 0; i < num_descs; i++) { @@ -740,7 +740,7 @@ static void xsk_consume_skb(struct sk_buff *skb) u32 num_descs = xsk_get_num_desc(skb); struct xsk_addrs *xsk_addr; - if (unlikely(num_descs > 1)) { + if (unlikely(!xsk_skb_destructor_is_addr(skb))) { xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; kmem_cache_free(xsk_tx_generic_cache, xsk_addr); } -- cgit v1.2.3 From 203cee647f551abc87b992045cd920b117ff990a Mon Sep 17 00:00:00 2001 
From: Jason Xing Date: Sat, 2 May 2026 23:07:22 +0300 Subject: xsk: fix u64 descriptor address truncation on 32-bit architectures In copy mode TX, xsk_skb_destructor_set_addr() stores the 64-bit descriptor address into skb_shinfo(skb)->destructor_arg (void *) via a uintptr_t cast: skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); On 32-bit architectures uintptr_t is 32 bits, so the upper 32 bits of the descriptor address are silently dropped. In XDP_ZEROCOPY unaligned mode the chunk offset is encoded in bits 48-63 of the descriptor address (XSK_UNALIGNED_BUF_OFFSET_SHIFT = 48), meaning the offset is lost entirely. The completion queue then returns a truncated address to userspace, making buffer recycling impossible. Fix this by handling the 32-bit case directly in xsk_skb_destructor_set_addr(): when !CONFIG_64BIT, allocate an xsk_addrs struct (the same path already used for multi-descriptor SKBs) to store the full u64 address. The existing tagged-pointer logic in xsk_skb_destructor_is_addr() stays unchanged: slab pointers returned from kmem_cache_zalloc() are always word-aligned and therefore have bit 0 clear, which correctly identifies them as a struct pointer rather than an inline tagged address on every architecture. Factor the shared kmem_cache_zalloc + destructor_arg assignment into __xsk_addrs_alloc() and add a wrapper xsk_addrs_alloc() that handles the inline-to-list upgrade (is_addr check + get_addr + num_descs = 1). The three former open-coded kmem_cache_zalloc call sites now reduce to a single call each. Propagate the -ENOMEM from xsk_skb_destructor_set_addr() through xsk_skb_init_misc() so the caller can clean up the skb via kfree_skb() before skb->destructor is installed. The overhead is one extra kmem_cache_zalloc per first descriptor on 32-bit only; 64-bit builds are completely unchanged. 
Closes: https://lore.kernel.org/all/20260419045824.D9E5EC2BCAF@smtp.kernel.org/ Fixes: 0ebc27a4c67d ("xsk: avoid data corruption on cq descriptor number") Signed-off-by: Jason Xing Acked-by: Stanislav Fomichev Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260502200722.53960-9-kerneljasonxing@gmail.com Signed-off-by: Jakub Kicinski --- net/xdp/xsk.c | 88 +++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 32 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 079abd4bcb69..5e5786cd9af5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } -static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr) { - skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + struct xsk_addrs *xsk_addr; + + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); + if (unlikely(!xsk_addr)) + return NULL; + + xsk_addr->addrs[0] = addr; + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + return xsk_addr; +} + +static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) + return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb)); + if (likely(xsk_addr)) + xsk_addr->num_descs = 1; + return xsk_addr; +} + +static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) +{ + if (IS_ENABLED(CONFIG_64BIT)) { + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); + return 0; + } + + if (unlikely(!__xsk_addrs_alloc(skb, addr))) + return -ENOMEM; + return 0; } static void xsk_inc_num_desc(struct sk_buff *skb) @@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static void 
xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, - u64 addr) +static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { + int err; + + err = xsk_skb_destructor_set_addr(skb, addr); + if (unlikely(err)) + return err; + skb->dev = xs->dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; - xsk_skb_destructor_set_addr(skb, addr); + return 0; } static void xsk_consume_skb(struct sk_buff *skb) @@ -829,18 +868,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, } else { struct xsk_addrs *xsk_addr; - if (xsk_skb_destructor_is_addr(skb)) { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, - GFP_KERNEL); - if (!xsk_addr) - return ERR_PTR(-ENOMEM); - - xsk_addr->num_descs = 1; - xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); - skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; - } else { - xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; - } + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); /* in case of -EOVERFLOW that could happen below, * xsk_consume_skb() will release this node as whole skb @@ -929,19 +959,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct page *page; u8 *vaddr; - if (xsk_skb_destructor_is_addr(skb)) { - xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, - GFP_KERNEL); - if (!xsk_addr) { - err = -ENOMEM; - goto free_err; - } - - xsk_addr->num_descs = 1; - xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); - skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; - } else { - xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + xsk_addr = xsk_addrs_alloc(skb); + if (!xsk_addr) { + err = -ENOMEM; + goto free_err; } if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { @@ -966,8 +987,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, } } - if (!xs->skb) - xsk_skb_init_misc(skb, xs, desc->addr); + if (!xs->skb) { + 
err = xsk_skb_init_misc(skb, xs, desc->addr); + if (unlikely(err)) + goto free_err; + } xsk_inc_num_desc(skb); return skb; -- cgit v1.2.3 From 283fc9e44ff5b5ac967439b4951b80bd4299f4e4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 May 2026 15:15:34 +0200 Subject: wifi: mac80211: remove station if connection prep fails If connection preparation fails for MLO connections, then the interface is completely reset to non-MLD. In this case, we must not keep the station since it's related to the link of the vif being removed. Delete an existing station. Any "new_sta" is already being removed, so that doesn't need changes. This fixes a use-after-free/double-free in debugfs if that's enabled, because a vif going from MLD (and to MLD, but that's not relevant here) recreates its entire debugfs. Cc: stable@vger.kernel.org Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Reviewed-by: Miriam Rachel Korenblit Link: https://patch.msgid.link/20260505151533.c4e52deb06ad.Iafe56cec7de8512626169496b134bce3a6c17010@changeid Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 298ebff6bbf8..0a0f27836d57 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -9149,7 +9149,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss *bss = (void *)cbss->priv; struct sta_info *new_sta = NULL; struct ieee80211_link_data *link; - bool have_sta = false; + struct sta_info *have_sta = NULL; bool mlo; int err; u16 new_links; @@ -9168,11 +9168,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, mlo = false; } - if (assoc) { - rcu_read_lock(); + if (assoc) have_sta = sta_info_get(sdata, ap_mld_addr); - rcu_read_unlock(); - } if (mlo && !have_sta && WARN_ON(sdata->vif.valid_links || sdata->vif.active_links)) @@ -9336,6 +9333,8 @@ static int 
ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, out_release_chan: ieee80211_link_release_channel(link); out_err: + if (mlo && have_sta) + WARN_ON(__sta_info_destroy(have_sta)); ieee80211_vif_set_links(sdata, 0, 0); return err; } -- cgit v1.2.3 From 0f3c0a197309717d74729568f88957d448847937 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 May 2026 13:38:37 +0200 Subject: wifi: nl80211: fix NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST usage This is documented as a u8 and has a policy of NLA_U8, but uses nla_get_u32() which means it's completely broken on big-endian. Fix it to use nla_get_u8(). Fixes: 9bb7e0f24e7e ("cfg80211: add peer measurement with FTM initiator API") Link: https://patch.msgid.link/20260505113837.260159-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- net/wireless/pmsr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 4c8ea0583f94..d6cd0de64d1f 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -88,7 +88,7 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = - nla_get_u32(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]); if (capa->ftm.max_ftms_per_burst && (out->ftm.ftms_per_burst > capa->ftm.max_ftms_per_burst || -- cgit v1.2.3 From 15994bb0cbb8fc4879da7552ddd08c1896261c39 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 14:48:53 +0800 Subject: wifi: nl80211: require CAP_NET_ADMIN over the target netns in SET_WIPHY_NETNS NL80211_CMD_SET_WIPHY_NETNS dispatches with GENL_UNS_ADMIN_PERM, which verifies that the caller has CAP_NET_ADMIN for the source netns. It doesn't verify that the caller has CAP_NET_ADMIN over the target netns selected by NL80211_ATTR_NETNS_FD or NL80211_ATTR_PID. 
This diverges from the convention enforced in net/core/rtnetlink.c::rtnl_get_net_ns_capable(): /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) return ERR_PTR(-EACCES); A user with CAP_NET_ADMIN in their own user namespace can therefore push a wiphy into an arbitrary netns (including init_net) over which they have no privilege. Mirror the rtnetlink convention by requiring CAP_NET_ADMIN in the target netns before calling cfg80211_switch_netns(). Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506064854.2207105-2-maoyixie.tju@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 67088804dcc7..db546dd93d08 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -13867,6 +13867,19 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(net)) return PTR_ERR(net); + /* + * The caller already has CAP_NET_ADMIN over the source netns + * (enforced by GENL_UNS_ADMIN_PERM on the genl op). Mirror the + * convention used by net/core/rtnetlink.c::rtnl_get_net_ns_capable() + * and require CAP_NET_ADMIN over the target netns as well, so that + * a caller that is privileged in their own user namespace cannot + * push a wiphy into a netns where they have no privilege. + */ + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + put_net(net); + return -EPERM; + } + err = 0; /* check if anything to do */ -- cgit v1.2.3 From 79240f3f6d766b342b57c32397d643e1cfa26b81 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Wed, 6 May 2026 14:48:54 +0800 Subject: wifi: nl80211: re-check wiphy netns in nl80211_prepare_wdev_dump() continuation NL80211_CMD_GET_SCAN is implemented as a multi-call dumpit. 
The first invocation of nl80211_prepare_wdev_dump() validates the requested wdev against the caller's netns via __cfg80211_wdev_from_attrs(). Subsequent invocations look up the same wiphy by its global index and do not check that the wiphy is still in the caller's netns. Add the same filter to the continuation path. If the wiphy's netns no longer matches the caller's, return -ENODEV and the netlink dump machinery terminates the walk cleanly. Signed-off-by: Maoyi Xie Link: https://patch.msgid.link/20260506064854.2207105-3-maoyixie.tju@gmail.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index db546dd93d08..7db9cd433801 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1276,6 +1276,18 @@ static int nl80211_prepare_wdev_dump(struct netlink_callback *cb, rtnl_unlock(); return -ENODEV; } + + /* + * The first invocation validated the wdev's netns against + * the caller via __cfg80211_wdev_from_attrs(). The wiphy + * may have moved netns between dumpit invocations (via + * NL80211_CMD_SET_WIPHY_NETNS), so re-check here. + */ + if (!net_eq(wiphy_net(wiphy), sock_net(cb->skb->sk))) { + rtnl_unlock(); + return -ENODEV; + } + *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; -- cgit v1.2.3 From b819db93d73f4593636299e229914052b89e3ef2 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sun, 12 Apr 2026 21:47:42 +0300 Subject: Bluetooth: SCO: fix sleeping under spinlock in sco_conn_ready sco_conn_ready calls sleeping functions under conn->lock spinlock. The critical section can be reduced: conn->hcon is modified only with hdev->lock held. It is guaranteed to be held in sco_conn_ready, so conn->lock is not needed to guard it. Move taking conn->lock after lock_sock(parent). This also follows the lock ordering lock_sock() > conn->lock elsewhere in the file. 
Fixes: 27c24fda62b60 ("Bluetooth: switch to lock_sock in SCO") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 18826d4b9c0b..3a5479538e85 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -1377,26 +1377,24 @@ static void sco_conn_ready(struct sco_conn *conn) sk->sk_state_change(sk); release_sock(sk); } else { - sco_conn_lock(conn); - - if (!conn->hcon) { - sco_conn_unlock(conn); + if (!conn->hcon) return; - } + + lockdep_assert_held(&conn->hcon->hdev->lock); parent = sco_get_sock_listen(&conn->hcon->src); - if (!parent) { - sco_conn_unlock(conn); + if (!parent) return; - } lock_sock(parent); + sco_conn_lock(conn); + sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { - release_sock(parent); sco_conn_unlock(conn); + release_sock(parent); return; } @@ -1417,9 +1415,9 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); - release_sock(parent); - sco_conn_unlock(conn); + + release_sock(parent); } } -- cgit v1.2.3 From 0beddb0c380bed5f5b8e61ddbe14635bb73d0b41 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sun, 12 Apr 2026 21:29:16 +0100 Subject: Bluetooth: hci_conn: fix potential UAF in create_big_sync Add hci_conn_valid() check in create_big_sync() to detect stale connections before proceeding with BIG creation. Handle the resulting -ECANCELED in create_big_complete() and re-validate the connection under hci_dev_lock() before dereferencing, matching the pattern used by create_le_conn_complete() and create_pa_complete(). Keep the hci_conn object alive across the async boundary by taking a reference via hci_conn_get() when queueing create_big_sync(), and dropping it in the completion callback. 
The refcount and the lock are complementary: the refcount keeps the object allocated, while hci_dev_lock() serializes hci_conn_hash_del()'s list_del_rcu() on hdev->conn_hash, as required by hci_conn_del(). hci_conn_put() is called outside hci_dev_unlock() so the final put (which resolves to kfree() via bt_link_release) does not run under hdev->lock, though the release path would be safe either way. Without this, create_big_complete() would unconditionally dereference the conn pointer on error, causing a use-after-free via hci_connect_cfm() and hci_conn_del(). Fixes: eca0ae4aea66 ("Bluetooth: Add initial implementation of BIS connections") Cc: stable@vger.kernel.org Co-developed-by: Luiz Augusto von Dentz Signed-off-by: Luiz Augusto von Dentz Signed-off-by: David Carlier Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3a0592599086..96e345fcf303 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2130,6 +2130,9 @@ static int create_big_sync(struct hci_dev *hdev, void *data) u32 flags = 0; int err; + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + if (qos->bcast.out.phys == BIT(1)) flags |= MGMT_ADV_FLAG_SEC_2M; @@ -2204,11 +2207,24 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "conn %p", conn); + if (err == -ECANCELED) + goto done; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, conn)) + goto unlock; + if (err) { bt_dev_err(hdev, "Unable to create BIG: %d", err); hci_connect_cfm(conn, err); hci_conn_del(conn); } + +unlock: + hci_dev_unlock(hdev); +done: + hci_conn_put(conn); } struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, @@ -2336,10 +2352,11 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ - err = 
hci_cmd_sync_queue(hdev, create_big_sync, conn, + err = hci_cmd_sync_queue(hdev, create_big_sync, hci_conn_get(conn), create_big_complete); if (err < 0) { hci_conn_drop(conn); + hci_conn_put(conn); return ERR_PTR(err); } -- cgit v1.2.3 From 5ddb8014261137cadaf83ab5617a588d80a22586 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 10 Apr 2026 15:29:52 -0400 Subject: Bluetooth: hci_event: Fix OOB read and infinite loop in hci_le_create_big_complete_evt hci_le_create_big_complete_evt() iterates over BT_BOUND connections for a BIG handle using a while loop, accessing ev->bis_handle[i++] on each iteration. However, there is no check that i stays within ev->num_bis before the array access. When a controller sends a LE_Create_BIG_Complete event with fewer bis_handle entries than there are BT_BOUND connections for that BIG, or with num_bis=0, the loop reads beyond the valid bis_handle[] flex array into adjacent heap memory. Since the out-of-bounds values typically exceed HCI_CONN_HANDLE_MAX (0x0EFF), hci_conn_set_handle() rejects them and the connection remains in BT_BOUND state. The same connection is then found again by hci_conn_hash_lookup_big_state(), creating an infinite loop with hci_dev_lock held. Fix this by terminating the BIG in case not all BIS could be set up properly. 
Fixes: a0bfde167b50 ("Bluetooth: ISO: Add support for connecting multiple BISes") Cc: stable@vger.kernel.org Signed-off-by: ZhiTao Ou Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b2ee6b6a0f56..1b3b9131affa 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7118,9 +7118,29 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, continue; } + if (ev->num_bis <= i) { + bt_dev_err(hdev, + "Not enough BIS handles for BIG 0x%2.2x", + ev->handle); + ev->status = HCI_ERROR_UNSPECIFIED; + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); + continue; + } + if (hci_conn_set_handle(conn, - __le16_to_cpu(ev->bis_handle[i++]))) + __le16_to_cpu(ev->bis_handle[i++]))) { + bt_dev_err(hdev, + "Failed to set BIS handle for BIG 0x%2.2x", + ev->handle); + /* Force error so BIG gets terminated as not all BIS + * could be connected. + */ + ev->status = HCI_ERROR_UNSPECIFIED; + hci_connect_cfm(conn, ev->status); + hci_conn_del(conn); continue; + } conn->state = BT_CONNECTED; set_bit(HCI_CONN_BIG_CREATED, &conn->flags); @@ -7129,7 +7149,10 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, hci_iso_setup_path(conn); } - if (!ev->status && !i) + /* If there is an unexpected error or if no BISes have been connected + * for the BIG, terminate it. + */ + if (ev->status == HCI_ERROR_UNSPECIFIED || (!ev->status && !i)) /* If no BISes have been connected for the BIG, * terminate. 
This is in case all bound connections * have been closed before the BIG creation -- cgit v1.2.3 From 72b8deccff17a7644e0367e1aaf1a36cfb014324 Mon Sep 17 00:00:00 2001 From: Dudu Lu Date: Wed, 15 Apr 2026 17:39:53 +0800 Subject: Bluetooth: bnep: fix incorrect length parsing in bnep_rx_frame() extension handling In bnep_rx_frame(), the BNEP_FILTER_NET_TYPE_SET and BNEP_FILTER_MULTI_ADDR_SET extension header parsing has two bugs: 1) The 2-byte length field is read with *(u16 *)(skb->data + 1), which performs a native-endian read. The BNEP protocol specifies this field in big-endian (network byte order), and the same file correctly uses get_unaligned_be16() for the identical fields in bnep_ctrl_set_netfilter() and bnep_ctrl_set_mcfilter(). 2) The length is multiplied by 2, but unlike BNEP_SETUP_CONN_REQ where the length byte counts UUID pairs (requiring * 2 for two UUIDs per entry), the filter extension length field already represents the total data size in bytes. This is confirmed by bnep_ctrl_set_netfilter() which reads the same field as a byte count and divides by 4 to get the number of filter entries. The bogus * 2 means skb_pull advances twice as far as it should, either dropping valid data from the next header or causing the pull to fail entirely when the doubled length exceeds the remaining skb. Fix by splitting the pull into two steps: first use skb_pull_data() to safely pull and validate the 3-byte fixed header (ctrl type + length), then pull the variable-length data using the properly decoded length. 
Fixes: bf8b9a9cb77b ("Bluetooth: bnep: Add support to extended headers of control frames") Signed-off-by: Dudu Lu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/bnep/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index d44987d4515c..853c8d7644b5 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -330,11 +330,18 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) goto badframe; break; case BNEP_FILTER_MULTI_ADDR_SET: - case BNEP_FILTER_NET_TYPE_SET: - /* Pull: ctrl type (1 b), len (2 b), data (len bytes) */ - if (!skb_pull(skb, 3 + *(u16 *)(skb->data + 1) * 2)) + case BNEP_FILTER_NET_TYPE_SET: { + u8 *hdr; + + /* Pull ctrl type (1 b) + len (2 b) */ + hdr = skb_pull_data(skb, 3); + if (!hdr) + goto badframe; + /* Pull data (len bytes); length is big-endian */ + if (!skb_pull(skb, get_unaligned_be16(&hdr[1]))) goto badframe; break; + } default: kfree_skb(skb); return 0; -- cgit v1.2.3 From 4f42363c814f28fe3f59847c35acf1ed033bedd4 Mon Sep 17 00:00:00 2001 From: Dudu Lu Date: Wed, 15 Apr 2026 18:43:55 +0800 Subject: Bluetooth: l2cap: fix MPS check in l2cap_ecred_reconf_req The L2CAP specification states that if more than one channel is being reconfigured, the MPS shall not be decreased. The current check has two issues: 1) The comparison uses >= (greater-than-or-equal), which incorrectly rejects reconfiguration requests where the MPS stays the same. Since the spec says MPS "shall be greater than or equal to the current MPS", only a strict decrease (remote_mps > mps) should be rejected. Keeping the same MPS is valid. 2) The multi-channel guard uses `&& i` (loop index) to approximate "more than one channel", but this incorrectly allows MPS decrease for the first channel (i==0) even when multiple channels are being reconfigured. 
Replace with `&& num_scid > 1` which correctly checks whether the request covers more than one channel. Fixes: 7accb1c4321a ("Bluetooth: L2CAP: Fix invalid response to L2CAP_ECRED_RECONF_REQ") Signed-off-by: Dudu Lu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 77dec104a9c3..b15374b951fa 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5428,7 +5428,7 @@ static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn, * configured, the MPS field may be less than the current MPS * of that channel. */ - if (chan[i]->remote_mps >= mps && i) { + if (chan[i]->remote_mps > mps && num_scid > 1) { BT_ERR("chan %p decreased MPS %u -> %u", chan[i], chan[i]->remote_mps, mps); result = L2CAP_RECONF_INVALID_MPS; -- cgit v1.2.3 From 91b5a598b5285da794b72619f31777b62dd336f8 Mon Sep 17 00:00:00 2001 From: Mikhail Gavrilov Date: Wed, 15 Apr 2026 02:52:37 +0500 Subject: Bluetooth: l2cap: defer conn param update to avoid conn->lock/hdev->lock inversion When a BLE peripheral sends an L2CAP Connection Parameter Update Request the processing path is: process_pending_rx() [takes conn->lock] l2cap_le_sig_channel() l2cap_conn_param_update_req() hci_le_conn_update() [takes hdev->lock] Meanwhile other code paths take the locks in the opposite order: l2cap_chan_connect() [takes hdev->lock] ... mutex_lock(&conn->lock) l2cap_conn_ready() [hdev->lock via hci_cb_list_lock] ... mutex_lock(&conn->lock) This is a classic AB/BA deadlock which lockdep reports as a circular locking dependency when connecting a BLE MIDI keyboard (Carry-On FC-49). Fix this by making hci_le_conn_update() defer the HCI command through hci_cmd_sync_queue() so it no longer needs to take hdev->lock in the caller context. 
The sync callback uses __hci_cmd_sync_status_sk() to wait for the HCI_EV_LE_CONN_UPDATE_COMPLETE event, then updates the stored connection parameters (hci_conn_params) and notifies userspace (mgmt_new_conn_param) only after the controller has confirmed the update. A reference on hci_conn is held via hci_conn_get()/hci_conn_put() for the lifetime of the queued work to prevent use-after-free, and hci_conn_valid() is checked before proceeding in case the connection was removed while the work was pending. The hci_dev_lock is held across hci_conn_valid() and all conn field accesses to prevent a concurrent disconnect from invalidating the connection mid-use. Fixes: f044eb0524a0 ("Bluetooth: Store latency and supervision timeout in connection params") Signed-off-by: Mikhail Gavrilov Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 105 ++++++++++++++++++++++++++++++++------- net/bluetooth/l2cap_core.c | 12 +---- 3 files changed, 89 insertions(+), 30 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a7bffb908c1e..aa600fbf9a53 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2495,7 +2495,7 @@ void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); int hci_abort_conn(struct hci_conn *conn, u8 reason); -u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 96e345fcf303..17b46ad6a349 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -480,40 +480,107 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return hci_setup_sync_conn(conn, handle); } 
-u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, - u16 to_multiplier) +struct le_conn_update_data { + struct hci_conn *conn; + u16 min; + u16 max; + u16 latency; + u16 to_multiplier; +}; + +static int le_conn_update_sync(struct hci_dev *hdev, void *data) { - struct hci_dev *hdev = conn->hdev; + struct le_conn_update_data *d = data; + struct hci_conn *conn = d->conn; struct hci_conn_params *params; struct hci_cp_le_conn_update cp; + u16 timeout; + u8 store_hint; + int err; + /* Verify connection is still alive and read conn fields under + * the same lock to prevent a concurrent disconnect from freeing + * or reusing the connection while we build the HCI command. + */ hci_dev_lock(hdev); - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); - if (params) { - params->conn_min_interval = min; - params->conn_max_interval = max; - params->conn_latency = latency; - params->supervision_timeout = to_multiplier; + if (!hci_conn_valid(hdev, conn)) { + hci_dev_unlock(hdev); + return -ECANCELED; } - hci_dev_unlock(hdev); - memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); - cp.conn_interval_min = cpu_to_le16(min); - cp.conn_interval_max = cpu_to_le16(max); - cp.conn_latency = cpu_to_le16(latency); - cp.supervision_timeout = cpu_to_le16(to_multiplier); + cp.conn_interval_min = cpu_to_le16(d->min); + cp.conn_interval_max = cpu_to_le16(d->max); + cp.conn_latency = cpu_to_le16(d->latency); + cp.supervision_timeout = cpu_to_le16(d->to_multiplier); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); + timeout = conn->conn_timeout; - hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); + hci_dev_unlock(hdev); - if (params) - return 0x01; + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CONN_UPDATE, + sizeof(cp), &cp, + HCI_EV_LE_CONN_UPDATE_COMPLETE, + timeout, NULL); + if (err) + return err; - return 0x00; + /* Update stored connection parameters after the controller has + * confirmed the update 
via the LE Connection Update Complete event. + */ + hci_dev_lock(hdev); + + params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); + if (params) { + params->conn_min_interval = d->min; + params->conn_max_interval = d->max; + params->conn_latency = d->latency; + params->supervision_timeout = d->to_multiplier; + store_hint = 0x01; + } else { + store_hint = 0x00; + } + + hci_dev_unlock(hdev); + + mgmt_new_conn_param(hdev, &conn->dst, conn->dst_type, store_hint, + d->min, d->max, d->latency, d->to_multiplier); + + return 0; +} + +static void le_conn_update_complete(struct hci_dev *hdev, void *data, int err) +{ + struct le_conn_update_data *d = data; + + hci_conn_put(d->conn); + kfree(d); +} + +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, + u16 to_multiplier) +{ + struct le_conn_update_data *d; + + d = kzalloc_obj(*d); + if (!d) + return; + + hci_conn_get(conn); + d->conn = conn; + d->min = min; + d->max = max; + d->latency = latency; + d->to_multiplier = to_multiplier; + + if (hci_cmd_sync_queue(conn->hdev, le_conn_update_sync, d, + le_conn_update_complete) < 0) { + hci_conn_put(conn); + kfree(d); + } } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index b15374b951fa..7701528f1167 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -4706,16 +4706,8 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn, l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_PARAM_UPDATE_RSP, sizeof(rsp), &rsp); - if (!err) { - u8 store_hint; - - store_hint = hci_le_conn_update(hcon, min, max, latency, - to_multiplier); - mgmt_new_conn_param(hcon->hdev, &hcon->dst, hcon->dst_type, - store_hint, min, max, latency, - to_multiplier); - - } + if (!err) + hci_le_conn_update(hcon, min, max, latency, to_multiplier); return 0; } -- cgit v1.2.3 From 2ff1a41a912de8517b4482e946dd951b7d80edbf Mon Sep 17 00:00:00 2001 From: Siwei 
Zhang Date: Wed, 15 Apr 2026 16:51:36 -0400 Subject: Bluetooth: L2CAP: Fix null-ptr-deref in l2cap_sock_state_change_cb() Add the same NULL guard already present in l2cap_sock_resume_cb() and l2cap_sock_ready_cb(). Fixes: 89bc500e41fc ("Bluetooth: Add state tracking to struct l2cap_chan") Cc: stable@kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 71e8c1b45bce..fb3cb70a5a39 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1657,6 +1657,9 @@ static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state, { struct sock *sk = chan->data; + if (!sk) + return; + sk->sk_state = state; if (err) -- cgit v1.2.3 From 78a88d43dab8d23aeef934ed8ce34d40e6b3d613 Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 15 Apr 2026 16:53:36 -0400 Subject: Bluetooth: L2CAP: Fix null-ptr-deref in l2cap_sock_get_sndtimeo_cb() Add the same NULL guard already present in l2cap_sock_resume_cb() and l2cap_sock_ready_cb(). 
Fixes: 8d836d71e222 ("Bluetooth: Access sk_sndtimeo indirectly in l2cap_core.c") Cc: stable@kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index fb3cb70a5a39..879c9f90269a 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1761,6 +1761,9 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return 0; + return READ_ONCE(sk->sk_sndtimeo); } -- cgit v1.2.3 From 0a120d96166301d7a95be75b52f843837dbd1219 Mon Sep 17 00:00:00 2001 From: Siwei Zhang Date: Wed, 15 Apr 2026 16:49:59 -0400 Subject: Bluetooth: L2CAP: Fix null-ptr-deref in l2cap_sock_new_connection_cb() Add the same NULL guard already present in l2cap_sock_resume_cb() and l2cap_sock_ready_cb(). Fixes: 80808e431e1e ("Bluetooth: Add l2cap_chan_ops abstraction") Cc: stable@kernel.org Signed-off-by: Siwei Zhang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_sock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 879c9f90269a..cf590a67d364 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1498,6 +1498,9 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) { struct sock *sk, *parent = chan->data; + if (!parent) + return NULL; + lock_sock(parent); /* Check for backlog size */ -- cgit v1.2.3 From 4e37f6452d586b95c346a9abdd2fb80b67794f39 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 18 Apr 2026 18:41:12 +0300 Subject: Bluetooth: SCO: hold sk properly in sco_conn_ready sk deref in sco_conn_ready must be done either under conn->lock, or holding a refcount, to avoid concurrent close. 
conn->sk and parent sk is currently accessed without either, and without checking parent->sk_state: [Task 1] [Task 2] sco_sock_release sco_conn_ready sk = conn->sk lock_sock(sk) conn->sk = NULL lock_sock(sk) release_sock(sk) sco_sock_kill(sk) UAF on sk deref and similarly for access to sco_get_sock_listen() return value. Fix possible UAF by holding sk refcount in sco_conn_ready() and making sco_get_sock_listen() increase refcount. Also recheck after lock_sock that the socket is still valid. Adjust conn->sk locking so it's protected also by lock_sock() of the associated socket if any. Fixes: 27c24fda62b60 ("Bluetooth: switch to lock_sock in SCO") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 3a5479538e85..eba44525d41d 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -472,9 +472,13 @@ static struct sock *sco_get_sock_listen(bdaddr_t *src) sk1 = sk; } + sk = sk ? sk : sk1; + if (sk) + sock_hold(sk); + read_unlock(&sco_sk_list.lock); - return sk ? 
sk : sk1; + return sk; } static void sco_sock_destruct(struct sock *sk) @@ -515,11 +519,13 @@ static void sco_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + lock_sock(sk); if (sco_pi(sk)->conn) { sco_conn_lock(sco_pi(sk)->conn); sco_pi(sk)->conn->sk = NULL; sco_conn_unlock(sco_pi(sk)->conn); } + release_sock(sk); /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); @@ -1365,17 +1371,28 @@ static int sco_sock_release(struct socket *sock) static void sco_conn_ready(struct sco_conn *conn) { - struct sock *parent; - struct sock *sk = conn->sk; + struct sock *parent, *sk; + + sco_conn_lock(conn); + sk = sco_sock_hold(conn); + sco_conn_unlock(conn); BT_DBG("conn %p", conn); if (sk) { lock_sock(sk); - sco_sock_clear_timer(sk); - sk->sk_state = BT_CONNECTED; - sk->sk_state_change(sk); + + /* conn->sk may have become NULL if racing with sk close, but + * due to held hdev->lock, it can't become different sk. 
+ */ + if (conn->sk) { + sco_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + } + release_sock(sk); + sock_put(sk); } else { if (!conn->hcon) return; @@ -1390,13 +1407,15 @@ static void sco_conn_ready(struct sco_conn *conn) sco_conn_lock(conn); + /* hdev->lock guarantees conn->sk == NULL still here */ + + if (parent->sk_state != BT_LISTEN) + goto release; + sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); - if (!sk) { - sco_conn_unlock(conn); - release_sock(parent); - return; - } + if (!sk) + goto release; sco_sock_init(sk, parent); @@ -1415,9 +1434,10 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); +release: sco_conn_unlock(conn); - release_sock(parent); + sock_put(parent); } } -- cgit v1.2.3 From 5917dd39db2bfc8b1b4c6ea8ed99adb4badef707 Mon Sep 17 00:00:00 2001 From: Sai Teja Aluvala Date: Mon, 20 Apr 2026 23:07:35 +0530 Subject: Bluetooth: btintel_pcie: treat boot stage bit 12 as warning CSR boot stage register bit 12 is documented as a device warning, not a fatal error. Rename the bit definition accordingly and stop including it in btintel_pcie_in_error(). This keeps warning-only boot stage values from being classified as errors while preserving abort-handler state as the actual error condition. 
Fixes: 190377500fde ("Bluetooth: btintel_pcie: Dump debug registers on error") Signed-off-by: Kiran K Signed-off-by: Sai Teja Aluvala Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 13 ++++++++++--- drivers/bluetooth/btintel_pcie.h | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 2f59c0d6f9ec..a3643e67b33f 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -289,6 +289,9 @@ static inline void btintel_pcie_dump_debug_registers(struct hci_dev *hdev) skb_put_data(skb, buf, strlen(buf)); data->boot_stage_cache = reg; + if (reg & BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_WARNING) + bt_dev_warn(hdev, "Controller device warning (boot_stage: 0x%8.8x)", reg); + reg = btintel_pcie_rd_reg32(data, BTINTEL_PCIE_CSR_IPC_STATUS_REG); snprintf(buf, sizeof(buf), "ipc status: 0x%8.8x", reg); skb_put_data(skb, buf, strlen(buf)); @@ -880,8 +883,11 @@ static inline bool btintel_pcie_in_lockdown(struct btintel_pcie_data *data) static inline bool btintel_pcie_in_error(struct btintel_pcie_data *data) { - return (data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_ERR) || - (data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_ABORT_HANDLER); + if (data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_WARNING) + bt_dev_warn(data->hdev, "Controller device warning (boot_stage: 0x%8.8x)", + data->boot_stage_cache); + + return data->boot_stage_cache & BTINTEL_PCIE_CSR_BOOT_STAGE_ABORT_HANDLER; } static void btintel_pcie_msix_gp1_handler(struct btintel_pcie_data *data) @@ -914,7 +920,8 @@ static void btintel_pcie_msix_gp0_handler(struct btintel_pcie_data *data) data->img_resp_cache = reg; if (btintel_pcie_in_error(data)) { - bt_dev_err(data->hdev, "Controller in error state"); + bt_dev_err(data->hdev, "Controller in error state (boot_stage: 0x%8.8x)", + data->boot_stage_cache); btintel_pcie_dump_debug_registers(data->hdev); 
return; } diff --git a/drivers/bluetooth/btintel_pcie.h b/drivers/bluetooth/btintel_pcie.h index 3c7bb708362d..f922abd1e7d8 100644 --- a/drivers/bluetooth/btintel_pcie.h +++ b/drivers/bluetooth/btintel_pcie.h @@ -48,7 +48,7 @@ #define BTINTEL_PCIE_CSR_BOOT_STAGE_OPFW (BIT(2)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_ROM_LOCKDOWN (BIT(10)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_IML_LOCKDOWN (BIT(11)) -#define BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_ERR (BIT(12)) +#define BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_WARNING (BIT(12)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_ABORT_HANDLER (BIT(13)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_DEVICE_HALTED (BIT(14)) #define BTINTEL_PCIE_CSR_BOOT_STAGE_MAC_ACCESS_ON (BIT(16)) -- cgit v1.2.3 From 902fe40bce7059722f7ffa1c378e577675cf1918 Mon Sep 17 00:00:00 2001 From: Aurelien DESBRIERES Date: Tue, 21 Apr 2026 15:53:31 +0200 Subject: Bluetooth: hci_uart: Fix NULL deref in recv callbacks when priv is uninitialized When a fault is injected during hci_uart line discipline setup, the proto open() callback may fail leaving hu->priv as NULL. A subsequent TIOCSTI ioctl can trigger the recv() callback before priv is initialized, causing a NULL pointer dereference. Fix all four affected HCI UART protocol drivers by adding a NULL check on hu->priv at the start of their recv() callbacks: h4, h5, ath and bcsp. 
Reported-by: syzbot+ff30eeab8e07b37d524e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ff30eeab8e07b37d524e Signed-off-by: Aurelien DESBRIERES Assisted-by: Claude:claude-sonnet-4-6 Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_ath.c | 3 +++ drivers/bluetooth/hci_bcsp.c | 3 +++ drivers/bluetooth/hci_h4.c | 3 +++ drivers/bluetooth/hci_h5.c | 3 +++ 4 files changed, 12 insertions(+) diff --git a/drivers/bluetooth/hci_ath.c b/drivers/bluetooth/hci_ath.c index fa679ad0acdf..8201fa7f61e8 100644 --- a/drivers/bluetooth/hci_ath.c +++ b/drivers/bluetooth/hci_ath.c @@ -191,6 +191,9 @@ static int ath_recv(struct hci_uart *hu, const void *data, int count) { struct ath_struct *ath = hu->priv; + if (!ath) + return -ENODEV; + ath->rx_skb = h4_recv_buf(hu, ath->rx_skb, data, count, ath_recv_pkts, ARRAY_SIZE(ath_recv_pkts)); if (IS_ERR(ath->rx_skb)) { diff --git a/drivers/bluetooth/hci_bcsp.c b/drivers/bluetooth/hci_bcsp.c index b386f91d8b46..db56eead27ce 100644 --- a/drivers/bluetooth/hci_bcsp.c +++ b/drivers/bluetooth/hci_bcsp.c @@ -585,6 +585,9 @@ static int bcsp_recv(struct hci_uart *hu, const void *data, int count) if (!test_bit(HCI_UART_REGISTERED, &hu->flags)) return -EUNATCH; + if (!bcsp) + return -ENODEV; + BT_DBG("hu %p count %d rx_state %d rx_count %ld", hu, count, bcsp->rx_state, bcsp->rx_count); diff --git a/drivers/bluetooth/hci_h4.c b/drivers/bluetooth/hci_h4.c index a889a66a326f..767372707498 100644 --- a/drivers/bluetooth/hci_h4.c +++ b/drivers/bluetooth/hci_h4.c @@ -109,6 +109,9 @@ static int h4_recv(struct hci_uart *hu, const void *data, int count) { struct h4_struct *h4 = hu->priv; + if (!h4) + return -ENODEV; + h4->rx_skb = h4_recv_buf(hu, h4->rx_skb, data, count, h4_recv_pkts, ARRAY_SIZE(h4_recv_pkts)); if (IS_ERR(h4->rx_skb)) { diff --git a/drivers/bluetooth/hci_h5.c b/drivers/bluetooth/hci_h5.c index cfdf75dc2847..d35383718212 100644 --- a/drivers/bluetooth/hci_h5.c +++ b/drivers/bluetooth/hci_h5.c @@ -587,6 
+587,9 @@ static int h5_recv(struct hci_uart *hu, const void *data, int count) struct h5 *h5 = hu->priv; const unsigned char *ptr = data; + if (!h5) + return -ENODEV; + BT_DBG("%s pending %zu count %d", hu->hdev->name, h5->rx_pending, count); -- cgit v1.2.3 From ca40d481079c05c6891a14a798c79596fd2d5f0c Mon Sep 17 00:00:00 2001 From: SeungJu Cheon Date: Tue, 21 Apr 2026 11:51:21 +0900 Subject: Bluetooth: ISO: Fix data-race on dst in iso_sock_connect() iso_sock_connect() copies the destination address into iso_pi(sk)->dst under lock_sock, then releases the lock and reads it back with bacmp() to decide between the CIS and BIS connect paths: lock_sock(sk); bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; release_sock(sk); if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) // <- no lock held This read after release_sock() races with any concurrent write to iso_pi(sk)->dst on the same socket. Fix by reading the destination address directly from the local sockaddr argument (sa->iso_bdaddr) instead of iso_pi(sk)->dst. Since sa is a function-local argument, reading it requires no locking and avoids the race. This patch addresses only the bacmp() race in iso_sock_connect(); other unprotected iso_pi(sk) accesses are fixed separately in the next patch. 
KCSAN report: BUG: KCSAN: data-race in memcmp+0x39/0xb0 race at unknown origin, with read to 0xffff8f96ea66dde3 of 1 bytes by task 549 on cpu 1: memcmp+0x39/0xb0 iso_sock_connect+0x275/0xb40 __sys_connect_file+0xbd/0xe0 __sys_connect+0xe0/0x110 __x64_sys_connect+0x40/0x50 x64_sys_call+0xcad/0x1c60 do_syscall_64+0x133/0x590 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x00 -> 0xee Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 549 Comm: iso_race_combin Not tainted 7.0.0-08391-g1d51b370a0f8 #40 PREEMPT(lazy) Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: SeungJu Cheon Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index be145e2736b7..290a1b9a9daa 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1193,7 +1193,7 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, release_sock(sk); - if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) + if (bacmp(&sa->iso_bdaddr, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); -- cgit v1.2.3 From f958c7805b18e9d69f6b322b231ecee46ec6f331 Mon Sep 17 00:00:00 2001 From: SeungJu Cheon Date: Tue, 21 Apr 2026 11:51:22 +0900 Subject: Bluetooth: ISO: Fix data-race on iso_pi(sk) in socket and HCI event paths Several iso_pi(sk) fields (qos, qos_user_set, bc_sid, base, base_len, sync_handle, bc_num_bis) are written under lock_sock in iso_sock_setsockopt() and iso_sock_bind(), but read and written under hci_dev_lock only in two other paths: - iso_connect_bis() / iso_connect_cis(), invoked from connect(2), read qos/base/bc_sid and reset qos to default_qos on the qos_user_set validation failure -- all without lock_sock. 
- iso_connect_ind(), invoked from hci_rx_work, writes sync_handle, bc_sid, qos.bcast.encryption, bc_num_bis, base and base_len on PA_SYNC_ESTABLISHED / PAST_RECEIVED / BIG_INFO_ADV_REPORT / PER_ADV_REPORT events. The BIG_INFO handler additionally passes &iso_pi(sk)->qos together with sync_handle / bc_num_bis / bc_bis to hci_conn_big_create_sync() while setsockopt may be mutating them. Acquire lock_sock around the affected accesses in both paths. The locking order hci_dev_lock -> lock_sock matches the existing iso_conn_big_sync() precedent, whose comment documents the same requirement for hci_conn_big_create_sync(). The HCI connect/bind helpers do not wait for command completion -- they enqueue work via hci_cmd_sync_queue{,_once}() / hci_le_create_cis_pending() and return -- so the added hold time is comparable to iso_conn_big_sync(). KCSAN report: BUG: KCSAN: data-race in iso_connect_cis / iso_sock_setsockopt read to 0xffffa3ae8ce3cdc8 of 1 bytes by task 335 on cpu 0: iso_connect_cis+0x49f/0xa20 iso_sock_connect+0x60e/0xb40 __sys_connect_file+0xbd/0xe0 __sys_connect+0xe0/0x110 __x64_sys_connect+0x40/0x50 x64_sys_call+0xcad/0x1c60 do_syscall_64+0x133/0x590 entry_SYSCALL_64_after_hwframe+0x77/0x7f write to 0xffffa3ae8ce3cdc8 of 60 bytes by task 334 on cpu 1: iso_sock_setsockopt+0x69a/0x930 do_sock_setsockopt+0xc3/0x170 __sys_setsockopt+0xd1/0x130 __x64_sys_setsockopt+0x64/0x80 x64_sys_call+0x1547/0x1c60 do_syscall_64+0x133/0x590 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 334 Comm: iso_setup_race Not tainted 7.0.0-10949-g8541d8f725c6 #44 PREEMPT(lazy) The iso_connect_ind() races were found by inspection. 
Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: SeungJu Cheon Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 54 +++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 290a1b9a9daa..7cb2864fe872 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -347,6 +347,7 @@ static int iso_connect_bis(struct sock *sk) return -EHOSTUNREACH; hci_dev_lock(hdev); + lock_sock(sk); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; @@ -399,13 +400,9 @@ static int iso_connect_bis(struct sock *sk) goto unlock; } - lock_sock(sk); - err = iso_chan_add(conn, sk, NULL); - if (err) { - release_sock(sk); + if (err) goto unlock; - } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -421,9 +418,8 @@ static int iso_connect_bis(struct sock *sk) iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } - release_sock(sk); - unlock: + release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -444,6 +440,7 @@ static int iso_connect_cis(struct sock *sk) return -EHOSTUNREACH; hci_dev_lock(hdev); + lock_sock(sk); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; @@ -498,13 +495,9 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } - lock_sock(sk); - err = iso_chan_add(conn, sk, NULL); - if (err) { - release_sock(sk); + if (err) goto unlock; - } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -520,9 +513,8 @@ static int iso_connect_cis(struct sock *sk) iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } - release_sock(sk); - unlock: + release_sock(sk); hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -2256,8 +2248,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) { + lock_sock(sk); iso_pi(sk)->sync_handle = 
le16_to_cpu(ev1->handle); iso_pi(sk)->bc_sid = ev1->sid; + release_sock(sk); } goto done; @@ -2268,8 +2262,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid_past, ev1a); if (sk && !ev1a->status) { + lock_sock(sk); iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle); iso_pi(sk)->bc_sid = ev1a->sid; + release_sock(sk); } goto done; @@ -2296,27 +2292,35 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) ev2); if (sk) { - int err; - struct hci_conn *hcon = iso_pi(sk)->conn->hcon; + int err = 0; + bool big_sync; + struct hci_conn *hcon; + lock_sock(sk); + + hcon = iso_pi(sk)->conn->hcon; iso_pi(sk)->qos.bcast.encryption = ev2->encryption; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; - if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && - !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { + big_sync = !test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && + !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags); + + if (big_sync) err = hci_conn_big_create_sync(hdev, hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); - if (err) { - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); - sock_put(sk); - sk = NULL; - } + + release_sock(sk); + + if (big_sync && err) { + bt_dev_err(hdev, "hci_le_big_create_sync: %d", + err); + sock_put(sk); + sk = NULL; } } @@ -2370,8 +2374,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!base || base_len > BASE_MAX_LENGTH) goto done; + lock_sock(sk); memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; + release_sock(sk); } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. 
-- cgit v1.2.3 From 634a4408c0615c523cf7531790f4f14a422b9206 Mon Sep 17 00:00:00 2001 From: Tristan Madani Date: Tue, 21 Apr 2026 11:14:54 +0000 Subject: Bluetooth: btmtk: validate WMT event SKB length before struct access btmtk_usb_hci_wmt_sync() casts the WMT event response SKB data to struct btmtk_hci_wmt_evt (7 bytes) and struct btmtk_hci_wmt_evt_funcc (9 bytes) without first checking that the SKB contains enough data. A short firmware response causes out-of-bounds reads from SKB tailroom. Use skb_pull_data() to validate and advance past the base WMT event header. For the FUNC_CTRL case, pull the additional status field bytes before accessing them. Fixes: d019930b0049 ("Bluetooth: btmtk: move btusb_mtk_hci_wmt_sync to btmtk.c") Cc: stable@vger.kernel.org Signed-off-by: Tristan Madani Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 6fb6ca274808..f70c1b0f8990 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -695,8 +695,13 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev, if (data->evt_skb == NULL) goto err_free_wc; - /* Parse and handle the return WMT event */ - wmt_evt = (struct btmtk_hci_wmt_evt *)data->evt_skb->data; + wmt_evt = skb_pull_data(data->evt_skb, sizeof(*wmt_evt)); + if (!wmt_evt) { + bt_dev_err(hdev, "WMT event too short (%u bytes)", + data->evt_skb->len); + err = -EINVAL; + goto err_free_skb; + } if (wmt_evt->whdr.op != hdr->op) { bt_dev_err(hdev, "Wrong op received %d expected %d", wmt_evt->whdr.op, hdr->op); @@ -712,6 +717,12 @@ static int btmtk_usb_hci_wmt_sync(struct hci_dev *hdev, status = BTMTK_WMT_PATCH_DONE; break; case BTMTK_WMT_FUNC_CTRL: + if (!skb_pull_data(data->evt_skb, + sizeof(wmt_evt_funcc->status))) { + err = -EINVAL; + goto err_free_skb; + } + wmt_evt_funcc = (struct btmtk_hci_wmt_evt_funcc *)wmt_evt; if 
(be16_to_cpu(wmt_evt_funcc->status) == 0x404) status = BTMTK_WMT_ON_DONE; -- cgit v1.2.3 From 21bd244b6de5d2fe1063c23acc93fbdd2b20d112 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 13:08:44 -0400 Subject: Bluetooth: virtio_bt: clamp rx length before skb_put virtbt_rx_work() calls skb_put(skb, len) where len comes directly from virtqueue_get_buf() with no validation against the buffer we posted to the device. The RX skb is allocated in virtbt_add_inbuf() and exposed to virtio as exactly 1000 bytes via sg_init_one(). Checking len against skb_tailroom(skb) is not sufficient because alloc_skb() can leave more tailroom than the 1000 bytes actually handed to the device. A malicious or buggy backend can therefore report used.len between 1001 and skb_tailroom(skb), causing skb_put() to include uninitialized kernel heap bytes that were never written by the device. The same path also accepts len == 0, in which case skb_put(skb, 0) leaves the skb empty but virtbt_rx_handle() still reads the pkt_type byte from skb->data, consuming uninitialized memory. Define VIRTBT_RX_BUF_SIZE once and reuse it in alloc_skb() and sg_init_one(), and gate virtbt_rx_work() on that same constant so the bound checked matches the buffer actually exposed to the device. Reject used.len == 0 in the same gate so an empty completion can no longer reach virtbt_rx_handle(). Use bt_dev_err_ratelimited() because the length value comes from an untrusted backend that can otherwise flood the kernel log. Same class of bug as commit c04db81cd028 ("net/9p: Fix buffer overflow in USB transport layer"), which hardened the USB 9p transport against unchecked device-reported length. 
Fixes: 160fbcf3bfb9 ("Bluetooth: virtio_bt: Use skb_put to set length") Cc: stable@vger.kernel.org Cc: Soenke Huster Signed-off-by: Michael Bommarito Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/virtio_bt.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c index 76d61af8a275..2c5c39356a1c 100644 --- a/drivers/bluetooth/virtio_bt.c +++ b/drivers/bluetooth/virtio_bt.c @@ -12,6 +12,7 @@ #include #define VERSION "0.1" +#define VIRTBT_RX_BUF_SIZE 1000 enum { VIRTBT_VQ_TX, @@ -33,11 +34,11 @@ static int virtbt_add_inbuf(struct virtio_bluetooth *vbt) struct sk_buff *skb; int err; - skb = alloc_skb(1000, GFP_KERNEL); + skb = alloc_skb(VIRTBT_RX_BUF_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; - sg_init_one(sg, skb->data, 1000); + sg_init_one(sg, skb->data, VIRTBT_RX_BUF_SIZE); err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL); if (err < 0) { @@ -227,8 +228,15 @@ static void virtbt_rx_work(struct work_struct *work) if (!skb) return; - skb_put(skb, len); - virtbt_rx_handle(vbt, skb); + if (!len || len > VIRTBT_RX_BUF_SIZE) { + bt_dev_err_ratelimited(vbt->hdev, + "rx reply len %u outside [1, %u]\n", + len, VIRTBT_RX_BUF_SIZE); + kfree_skb(skb); + } else { + skb_put(skb, len); + virtbt_rx_handle(vbt, skb); + } if (virtbt_add_inbuf(vbt) < 0) return; -- cgit v1.2.3 From daf23014e5d975e72ea9c02b5160d3fcf070ea47 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Tue, 21 Apr 2026 13:08:45 -0400 Subject: Bluetooth: virtio_bt: validate rx pkt_type header length virtbt_rx_handle() reads the leading pkt_type byte from the RX skb and forwards the remainder to hci_recv_frame() for every event/ACL/SCO/ISO type, without checking that the remaining payload is at least the fixed HCI header for that type. 
After the preceding patch bounds the backend-supplied used.len to [1, VIRTBT_RX_BUF_SIZE], a one-byte completion still reaches hci_recv_frame() with skb->len already pulled to 0. If the byte happened to be HCI_ACLDATA_PKT, the ACL-vs-ISO classification fast-path in hci_dev_classify_pkt_type() dereferences hci_acl_hdr(skb)->handle whenever the HCI device has an active CIS_LINK, BIS_LINK, or PA_LINK connection, reading two bytes of uninitialized RX-buffer data. The same hazard exists for every packet type the driver accepts because none of the switch cases in virtbt_rx_handle() check skb->len against the per-type minimum HCI header size before handing the frame to the core. After stripping pkt_type, require skb->len to cover the fixed header size for the selected type (event 2, ACL 4, SCO 3, ISO 4) before calling hci_recv_frame(); drop ratelimited otherwise. Unknown pkt_type values still take the original kfree_skb() default path. Use bt_dev_err_ratelimited() because both the length and pkt_type values come from an untrusted backend that can otherwise flood the kernel log. 
Fixes: 160fbcf3bfb9 ("Bluetooth: virtio_bt: Use skb_put to set length") Cc: stable@vger.kernel.org Cc: Soenke Huster Signed-off-by: Michael Bommarito Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/virtio_bt.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c index 2c5c39356a1c..140ab55c9fc5 100644 --- a/drivers/bluetooth/virtio_bt.c +++ b/drivers/bluetooth/virtio_bt.c @@ -198,6 +198,7 @@ static int virtbt_shutdown_generic(struct hci_dev *hdev) static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb) { + size_t min_hdr; __u8 pkt_type; pkt_type = *((__u8 *) skb->data); @@ -205,16 +206,32 @@ static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb) switch (pkt_type) { case HCI_EVENT_PKT: + min_hdr = sizeof(struct hci_event_hdr); + break; case HCI_ACLDATA_PKT: + min_hdr = sizeof(struct hci_acl_hdr); + break; case HCI_SCODATA_PKT: + min_hdr = sizeof(struct hci_sco_hdr); + break; case HCI_ISODATA_PKT: - hci_skb_pkt_type(skb) = pkt_type; - hci_recv_frame(vbt->hdev, skb); + min_hdr = sizeof(struct hci_iso_hdr); break; default: kfree_skb(skb); - break; + return; } + + if (skb->len < min_hdr) { + bt_dev_err_ratelimited(vbt->hdev, + "rx pkt_type 0x%02x payload %u < hdr %zu\n", + pkt_type, skb->len, min_hdr); + kfree_skb(skb); + return; + } + + hci_skb_pkt_type(skb) = pkt_type; + hci_recv_frame(vbt->hdev, skb); } static void virtbt_rx_work(struct work_struct *work) -- cgit v1.2.3 From 8f59d17b18a78fdfdbb67d693b3d3eb03db184e0 Mon Sep 17 00:00:00 2001 From: Pengpeng Hou Date: Thu, 23 Apr 2026 23:31:00 +0800 Subject: Bluetooth: RFCOMM: pull credit byte with skb_pull_data() rfcomm_recv_data() treats the first payload byte as a credit field when the UIH frame carries PF and credit-based flow control is enabled. 
After the header has been stripped, the PF/CFC path consumes that byte with a direct skb->data dereference followed by skb_pull(). A malformed short frame can reach this path without a byte available. Use skb_pull_data() so the length check and pull happen together before the returned credit byte is consumed. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Pengpeng Hou Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 611a9a94151e..d11bd5337d57 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1715,9 +1715,12 @@ static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk } if (pf && d->cfc) { - u8 credits = *(u8 *) skb->data; skb_pull(skb, 1); + u8 *credits = skb_pull_data(skb, 1); - d->tx_credits += credits; + if (!credits) + goto drop; + + d->tx_credits += *credits; if (d->tx_credits) clear_bit(RFCOMM_TX_THROTTLED, &d->flags); } -- cgit v1.2.3 From 72d97cae2a83cecf6f47208646675ecd066d0a3e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 29 Apr 2026 15:40:46 +0200 Subject: Bluetooth: hci_event: fix memset typo hci_le_big_sync_established_evt() currently does: conn->num_bis = 0; memset(conn->bis, 0, sizeof(conn->num_bis)); sizeof(conn->num_bis) is wrong - it would make sense to either use conn->num_bis (before setting that to 0) or sizeof(conn->bis). Fix it by using sizeof(conn->bis), the least intrusive change. Luckily, nothing actually depends on this memset() working properly: Nothing seems to ever read from conn->bis beyond conn->num_bis, and when conn->num_bis is increased, the corresponding elements of conn->bis are initialized. So I think this line could also just be removed. This is a purely theoretical fix and should have no impact on actual behavior. 
Fixes: 42ecf1947135 ("Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending") Signed-off-by: Jann Horn Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1b3b9131affa..eea2f810aafa 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -7191,7 +7191,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); conn->num_bis = 0; - memset(conn->bis, 0, sizeof(conn->num_bis)); + memset(conn->bis, 0, sizeof(conn->bis)); for (i = 0; i < ev->num_bis; i++) { u16 handle = le16_to_cpu(ev->bis[i]); -- cgit v1.2.3 From c5d415596cb6fbdf6334b06cc87a1a5a268d8725 Mon Sep 17 00:00:00 2001 From: Michael Bommarito Date: Sat, 2 May 2026 12:43:03 -0400 Subject: Bluetooth: HIDP: serialise l2cap_unregister_user via hidp_session_sem Commit dbf666e4fc9b ("Bluetooth: HIDP: Fix possible UAF") made hidp_session_remove() drop the L2CAP reference and set session->conn = NULL once the session is considered removed, and added a bare if (session->conn) guard around the kthread-exit l2cap_unregister_user() call in hidp_session_thread(). The sibling ioctl site in hidp_connection_del() still reads session->conn unlocked and unguarded, and the kthread-exit guard itself is a lockless double-read. hidp_session_find() drops hidp_session_sem before returning, so hidp_session_remove() can null session->conn between the lookup and the call in hidp_connection_del(). Worse, since commit 752a6c9596dd ("Bluetooth: L2CAP: Fix use-after-free in l2cap_unregister_user") takes mutex_lock(&conn->lock) inside l2cap_unregister_user(), a stale non-NULL snapshot also UAFs on conn->lock. v1 only added an if (session->conn) guard at the ioctl site, which doesn't address either race; Luiz suggested snapshotting session->conn under the sem and clearing it before the call. 
Taking hidp_session_sem across l2cap_unregister_user() would be wrong: l2cap_conn_del() already establishes the lock order conn->lock -> hidp_session_sem via l2cap_unregister_all_users() -> user->remove == hidp_session_remove(), so taking hidp_session_sem before conn->lock would AB/BA deadlock. Factor a helper hidp_session_unregister_conn() that under down_write(&hidp_session_sem) snapshots session->conn and clears the member, then outside the sem calls l2cap_unregister_user() and l2cap_conn_put() on the snapshot. Call it from both hidp_connection_del() and hidp_session_thread()'s exit path. At most one consumer wins the write-sem; later callers observe session->conn == NULL and skip the unregister and put, so the reference hidp_session_new() took via l2cap_conn_get() is consumed exactly once. session_free() already tolerates a NULL session->conn. Fixes: dbf666e4fc9b ("Bluetooth: HIDP: Fix possible UAF") Suggested-by: Luiz Augusto von Dentz Link: https://lore.kernel.org/all/20260422011437.176643-1-michael.bommarito@gmail.com/ Signed-off-by: Michael Bommarito Assisted-by: Claude:claude-opus-4-7 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hidp/core.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 7bcf8c5ceaee..976f91eeb745 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1035,6 +1035,28 @@ static struct hidp_session *hidp_session_find(const bdaddr_t *bdaddr) return session; } +/* + * Consume session->conn: clear the member under hidp_session_sem, then + * l2cap_unregister_user() and l2cap_conn_put() the snapshot outside the + * sem. At most one caller wins; later callers see NULL and skip. The + * reference is the one hidp_session_new() took via l2cap_conn_get(). 
+ */ +static void hidp_session_unregister_conn(struct hidp_session *session) +{ + struct l2cap_conn *conn; + + down_write(&hidp_session_sem); + conn = session->conn; + if (conn) + session->conn = NULL; + up_write(&hidp_session_sem); + + if (conn) { + l2cap_unregister_user(conn, &session->user); + l2cap_conn_put(conn); + } +} + /* * Start session synchronously * This starts a session thread and waits until initialization @@ -1311,8 +1333,7 @@ static int hidp_session_thread(void *arg) * Instead, this call has the same semantics as if user-space tried to * delete the session. */ - if (session->conn) - l2cap_unregister_user(session->conn, &session->user); + hidp_session_unregister_conn(session); hidp_session_put(session); @@ -1418,7 +1439,7 @@ int hidp_connection_del(struct hidp_conndel_req *req) HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); else - l2cap_unregister_user(session->conn, &session->user); + hidp_session_unregister_conn(session); hidp_session_put(session); -- cgit v1.2.3 From 0e1368a28dd5231ae0dbe240dfe0ff2657de5647 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 May 2026 17:22:05 -0700 Subject: selftests: drv-net: fix sort order of makefile and config Recent changes added configs and tests in the wrong spot. 
Link: https://lore.kernel.org/20260506170435.34984dfc@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/Makefile | 2 +- tools/testing/selftests/drivers/net/hw/config | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 3b6ff4708005..82809d5b2478 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -30,8 +30,8 @@ TEST_PROGS = \ gro_hw.py \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ - ipsec_vxlan.py \ iou-zcrx.py \ + ipsec_vxlan.py \ irq.py \ loopback.sh \ nic_timestamp.py \ diff --git a/tools/testing/selftests/drivers/net/hw/config b/tools/testing/selftests/drivers/net/hw/config index ae0168c2bbe6..8c132ace2b8d 100644 --- a/tools/testing/selftests/drivers/net/hw/config +++ b/tools/testing/selftests/drivers/net/hw/config @@ -3,6 +3,10 @@ CONFIG_FAIL_FUNCTION=y CONFIG_FAULT_INJECTION=y CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FUNCTION_ERROR_INJECTION=y +CONFIG_INET6_ESP=y +CONFIG_INET6_ESP_OFFLOAD=y +CONFIG_INET_ESP=y +CONFIG_INET_ESP_OFFLOAD=y CONFIG_IO_URING=y CONFIG_IPV6=y CONFIG_IPV6_GRE=y @@ -12,10 +16,6 @@ CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NETKIT=y CONFIG_NET_SCH_INGRESS=y -CONFIG_INET6_ESP=y -CONFIG_INET6_ESP_OFFLOAD=y -CONFIG_INET_ESP=y -CONFIG_INET_ESP_OFFLOAD=y CONFIG_UDMABUF=y CONFIG_VXLAN=y CONFIG_XFRM_USER=y -- cgit v1.2.3 From 7aaa8f5e45a92678256c1e17f1fa2c2f45c61dd1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 13:00:56 +0000 Subject: ipv6: fix potential UAF caused by ip6_forward_proxy_check() ip6_forward_proxy_check() calls pskb_may_pull() which might re-allocate skb->head. Reload ipv6_hdr() after the pskb_may_pull() call to avoid using the freed memory. 
Fixes: e21e0b5f19ac ("[IPV6] NDISC: Handle NDP messages to proxied addresses.") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260505130056.2927197-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1f2a33fbed6e..c14adcdd4396 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -468,6 +468,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) default: break; } + hdr = ipv6_hdr(skb); } /* @@ -582,6 +583,8 @@ int ip6_forward(struct sk_buff *skb) if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); + + hdr = ipv6_hdr(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit * here by 1, as we do at the end of the -- cgit v1.2.3 From 7ce3f1bedaac88880594720ba0f687da3bd7fc8a Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 5 May 2026 03:42:23 -0700 Subject: netdevsim: psp: only call nsim_psp_uninit() on PFs VFs go through nsim_init_netdevsim_vf() which never calls nsim_psp_init(), so ns->psp.dev stays NULL. nsim_psp_uninit() guards with !IS_ERR(ns->psp.dev), so destroying a VF reaches psp_dev_unregister(NULL) and dereferences NULL on the first mutex_lock(&psd->lock): BUG: kernel NULL pointer dereference, address: 0000000000000020 RIP: 0010:mutex_lock+0x1c/0x30 Call Trace: psp_dev_unregister+0x2a/0x1a0 nsim_psp_uninit+0x1f/0x40 [netdevsim] nsim_destroy+0x61/0x1e0 [netdevsim] __nsim_dev_port_del+0x47/0x90 [netdevsim] nsim_drv_configure_vfs+0xc9/0x130 [netdevsim] nsim_bus_dev_numvfs_store+0x79/0xb0 [netdevsim] Gate nsim_psp_uninit() on nsim_dev_port_is_pf(), matching the pattern already used for nsim_exit_netdevsim() and the bpf/ipsec/macsec/queue teardowns. 
Reproducer: modprobe netdevsim echo "10 1" > /sys/bus/netdevsim/new_device echo 1 > /sys/bus/netdevsim/devices/netdevsim10/sriov_numvfs devlink dev eswitch set netdevsim/netdevsim10 mode switchdev echo 0 > /sys/bus/netdevsim/devices/netdevsim10/sriov_numvfs Fixes: f857478d6206 ("netdevsim: a basic test PSP implementation") Signed-off-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260505-psd-rcu-v1-1-a8f69ec1ab96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index a05af192caf3..a750768912b5 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -1182,7 +1182,8 @@ void nsim_destroy(struct netdevsim *ns) unregister_netdevice_notifier_dev_net(ns->netdev, &ns->nb, &ns->nn); - nsim_psp_uninit(ns); + if (nsim_dev_port_is_pf(ns->nsim_dev_port)) + nsim_psp_uninit(ns); rtnl_lock(); peer = rtnl_dereference(ns->peer); -- cgit v1.2.3 From 24c96a42006ee27a078ec8c631c906dea8a3ca6d Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 5 May 2026 03:42:24 -0700 Subject: netdevsim: psp: serialize calls to nsim_psp_uninit() The debugfs write handler, nsim_psp_rereg_write(), can race against nsim_destroy() and against itself, causing nsim_psp_uninit() to run more than once concurrently. Two complementary changes serialize all callers: 1. Delete the psp_rereg debugfs file from nsim_psp_uninit() before doing the actual teardown. debugfs_remove() drains any in-flight writers and prevents new ones from starting. 2. Add a mutex around the body of nsim_psp_rereg_write() so that two concurrent userspace writers cannot both enter the teardown path at once. 
The teardown work itself is moved into a new __nsim_psp_uninit() that the rereg handler calls under the mutex, while the public nsim_psp_uninit() wraps it with the debugfs_remove()/mutex_destroy() pair so nsim_destroy() doesn't have to know about the psp internals. Fixes: f857478d6206 ("netdevsim: a basic test PSP implementation") Signed-off-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260505-psd-rcu-v1-2-a8f69ec1ab96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdevsim.h | 2 ++ drivers/net/netdevsim/psp.c | 17 ++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index 7e129dddbbe7..e373ffc26b0c 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -121,6 +121,8 @@ struct netdevsim { u64_stats_t tx_bytes; struct u64_stats_sync syncp; struct psp_dev *dev; + struct dentry *rereg; + struct mutex rereg_lock; u32 spi; u32 assoc_cnt; } psp; diff --git a/drivers/net/netdevsim/psp.c b/drivers/net/netdevsim/psp.c index 0b4d717253b0..86d84b7e566b 100644 --- a/drivers/net/netdevsim/psp.c +++ b/drivers/net/netdevsim/psp.c @@ -209,13 +209,20 @@ static struct psp_dev_caps nsim_psp_caps = { .assoc_drv_spc = sizeof(void *), }; -void nsim_psp_uninit(struct netdevsim *ns) +static void __nsim_psp_uninit(struct netdevsim *ns) { if (!IS_ERR(ns->psp.dev)) psp_dev_unregister(ns->psp.dev); WARN_ON(ns->psp.assoc_cnt); } +void nsim_psp_uninit(struct netdevsim *ns) +{ + debugfs_remove(ns->psp.rereg); + mutex_destroy(&ns->psp.rereg_lock); + __nsim_psp_uninit(ns); +} + static ssize_t nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) @@ -223,11 +230,13 @@ nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, struct netdevsim *ns = file->private_data; int err; - nsim_psp_uninit(ns); + mutex_lock(&ns->psp.rereg_lock); + 
__nsim_psp_uninit(ns); ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, &nsim_psp_caps, ns); err = PTR_ERR_OR_ZERO(ns->psp.dev); + mutex_unlock(&ns->psp.rereg_lock); return err ?: count; } @@ -249,6 +258,8 @@ int nsim_psp_init(struct netdevsim *ns) if (err) return err; - debugfs_create_file("psp_rereg", 0200, ddir, ns, &nsim_psp_rereg_fops); + mutex_init(&ns->psp.rereg_lock); + ns->psp.rereg = debugfs_create_file("psp_rereg", 0200, ddir, ns, + &nsim_psp_rereg_fops); return 0; } -- cgit v1.2.3 From 07bdec3fc737aac7f4c273aafa803d353174c43e Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 5 May 2026 03:42:25 -0700 Subject: netdevsim: psp: rcu protect psp_dev reference There are two issues with the way psp_dev is used in nsim_do_psp(): 1. There is no check for IS_ERR() on the peers psp_dev, before dereferencing. 2. The refcount on this psp_dev can be dropped by nsim_psp_rereg_write() To fix this, we can make netdevsim's reference to its psp_dev an rcu reference, and then nsim_do_psp() can read the fields it needs from an rcu critical section. 
Fixes: f857478d6206 ("netdevsim: a basic test PSP implementation") Signed-off-by: Daniel Zahka Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260505-psd-rcu-v1-3-a8f69ec1ab96@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdevsim.h | 2 +- drivers/net/netdevsim/psp.c | 54 +++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index e373ffc26b0c..d909c4160ea1 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -120,7 +120,7 @@ struct netdevsim { u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; - struct psp_dev *dev; + struct psp_dev __rcu *dev; struct dentry *rereg; struct mutex rereg_lock; u32 spi; diff --git a/drivers/net/netdevsim/psp.c b/drivers/net/netdevsim/psp.c index 86d84b7e566b..6936ecb8173e 100644 --- a/drivers/net/netdevsim/psp.c +++ b/drivers/net/netdevsim/psp.c @@ -19,6 +19,7 @@ nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, struct netdevsim *peer_ns, struct skb_ext **psp_ext) { enum skb_drop_reason rc = 0; + struct psp_dev *peer_psd; struct psp_assoc *pas; struct net *net; void **ptr; @@ -48,7 +49,8 @@ nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, } /* Now pretend we just received this frame */ - if (peer_ns->psp.dev->config.versions & (1 << pas->version)) { + peer_psd = rcu_dereference(peer_ns->psp.dev); + if (peer_psd && peer_psd->config.versions & (1 << pas->version)) { bool strip_icv = false; u8 generation; @@ -61,8 +63,7 @@ nsim_do_psp(struct sk_buff *skb, struct netdevsim *ns, skb_ext_reset(skb); skb->mac_len = ETH_HLEN; - if (psp_dev_rcv(skb, peer_ns->psp.dev->id, generation, - strip_icv)) { + if (psp_dev_rcv(skb, peer_psd->id, generation, strip_icv)) { rc = SKB_DROP_REASON_PSP_OUTPUT; goto out_unlock; } @@ -209,10 +210,18 @@ static struct psp_dev_caps nsim_psp_caps = { .assoc_drv_spc = sizeof(void *), }; -static 
void __nsim_psp_uninit(struct netdevsim *ns) +static void __nsim_psp_uninit(struct netdevsim *ns, bool teardown) { - if (!IS_ERR(ns->psp.dev)) - psp_dev_unregister(ns->psp.dev); + struct psp_dev *psd; + + psd = rcu_dereference_protected(ns->psp.dev, + teardown || + lockdep_is_held(&ns->psp.rereg_lock)); + if (psd) { + rcu_assign_pointer(ns->psp.dev, NULL); + synchronize_rcu(); + psp_dev_unregister(psd); + } WARN_ON(ns->psp.assoc_cnt); } @@ -220,7 +229,7 @@ void nsim_psp_uninit(struct netdevsim *ns) { debugfs_remove(ns->psp.rereg); mutex_destroy(&ns->psp.rereg_lock); - __nsim_psp_uninit(ns); + __nsim_psp_uninit(ns, true); } static ssize_t @@ -228,16 +237,23 @@ nsim_psp_rereg_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct netdevsim *ns = file->private_data; - int err; + struct psp_dev *psd; + ssize_t ret; mutex_lock(&ns->psp.rereg_lock); - __nsim_psp_uninit(ns); + __nsim_psp_uninit(ns, false); + + psd = psp_dev_create(ns->netdev, &nsim_psp_ops, &nsim_psp_caps, ns); + if (IS_ERR(psd)) { + ret = PTR_ERR(psd); + goto out; + } - ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, - &nsim_psp_caps, ns); - err = PTR_ERR_OR_ZERO(ns->psp.dev); + rcu_assign_pointer(ns->psp.dev, psd); + ret = count; +out: mutex_unlock(&ns->psp.rereg_lock); - return err ?: count; + return ret; } static const struct file_operations nsim_psp_rereg_fops = { @@ -250,13 +266,13 @@ static const struct file_operations nsim_psp_rereg_fops = { int nsim_psp_init(struct netdevsim *ns) { struct dentry *ddir = ns->nsim_dev_port->ddir; - int err; + struct psp_dev *psd; + + psd = psp_dev_create(ns->netdev, &nsim_psp_ops, &nsim_psp_caps, ns); + if (IS_ERR(psd)) + return PTR_ERR(psd); - ns->psp.dev = psp_dev_create(ns->netdev, &nsim_psp_ops, - &nsim_psp_caps, ns); - err = PTR_ERR_OR_ZERO(ns->psp.dev); - if (err) - return err; + rcu_assign_pointer(ns->psp.dev, psd); mutex_init(&ns->psp.rereg_lock); ns->psp.rereg = debugfs_create_file("psp_rereg", 0200, ddir, ns, -- 
cgit v1.2.3 From 701ea57feaabdea403cf299ee5cd0445083bc0ac Mon Sep 17 00:00:00 2001 From: Shitalkumar Gandhi Date: Tue, 5 May 2026 18:02:36 +0530 Subject: net: rtsn: fix mdio_node leak in rtsn_mdio_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit of_get_child_by_name() takes a reference. The rtsn_reset() and rtsn_change_mode() failure paths jump to out_free_bus and leak mdio_node. Add out_put_node to drop it before falling through. Fixes: b0d3969d2b4d ("net: ethernet: rtsn: Add support for Renesas Ethernet-TSN") Signed-off-by: Shitalkumar Gandhi Reviewed-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Reviewed-by: Niklas Söderlund Link: https://patch.msgid.link/20260505123236.406000-1-shitalkumar.gandhi@cambiumnetworks.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/rtsn.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/rtsn.c b/drivers/net/ethernet/renesas/rtsn.c index 03a2669f0518..ee8381b60b8d 100644 --- a/drivers/net/ethernet/renesas/rtsn.c +++ b/drivers/net/ethernet/renesas/rtsn.c @@ -797,11 +797,11 @@ static int rtsn_mdio_alloc(struct rtsn_private *priv) /* Enter config mode before registering the MDIO bus */ ret = rtsn_reset(priv); if (ret) - goto out_free_bus; + goto out_put_node; ret = rtsn_change_mode(priv, OCR_OPC_CONFIG); if (ret) - goto out_free_bus; + goto out_put_node; rtsn_modify(priv, MPIC, MPIC_PSMCS_MASK | MPIC_PSMHT_MASK, MPIC_PSMCS_DEFAULT | MPIC_PSMHT_DEFAULT); @@ -824,6 +824,8 @@ static int rtsn_mdio_alloc(struct rtsn_private *priv) return 0; +out_put_node: + of_node_put(mdio_node); out_free_bus: mdiobus_free(mii); return ret; -- cgit v1.2.3 From 67ef49047d312be692c8c439145f4514174e517f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 13:32:33 +0000 Subject: inetpeer: add a missing read_seqretry() in inet_getpeer() When performing a lockless lookup over the inet_peer rbtree, if a matching node is found, 
inet_getpeer() returns it immediately without validating the seqlock sequence. This missing check introduces a race condition: Trigger Path: When a host receives an incoming fragmented IPv4 packet, ip4_frag_init() (in net/ipv4/ip_fragment.c) calls inet_getpeer_v4() to track the peer. The Race: If the packet is from a new source IP, CPU A acquires the write_seqlock, allocates a new inet_peer node (p), sets its IP address (daddr), and links it to the rbtree (rb_link_node). Uninitialized Access: Due to the lack of memory barriers between rb_link_node and the initialization of the rest of the struct (like refcount_set(&p->refcnt, 1)), CPU A can make the node visible to readers before its refcnt is initialized. This is especially true on weakly-ordered architectures like ARM64 where the CPU can reorder the memory stores. Lockless Reader: Concurrently, CPU B processes a second fragmented packet from the same source IP. CPU B does a lockless lookup, finds the newly inserted node, and returns it immediately. Use-After-Free (UAF): CPU B reads p->refcnt as uninitialized garbage (left over from previous kmalloc-128/192 allocations). If the garbage is > 0, refcount_inc_not_zero(&p->refcnt) succeeds. CPU A then executes refcount_set(&p->refcnt, 1), overwriting CPU B's increment. When CPU B finishes with the fragment queue, it calls inet_putpeer(), which drops the refcount to 0 and frees the node via RCU. The node is now freed but remains linked in the rbtree, resulting in a Use-After-Free in the rbtree. 
Fixes: b145425f269a ("inetpeer: remove AVL implementation in favor of RB tree") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260505133233.3039575-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inetpeer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d8083b9033c2..5b957a831e7c 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -179,7 +179,8 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - if (p) + /* Make sure tree was not modified during our lookup. */ + if (p && !read_seqretry(&base->lock, seq)) return p; /* retry an exact lookup, taking the lock before. -- cgit v1.2.3 From 770b136ff9bf3e319d19875da59c4f7f4853da3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 09:11:33 +0000 Subject: net/sched: sch_sfq: annotate data-races from sfq_dump_class_stats() sfq_dump_class_stats() runs locklessly, add needed READ_ONCE() and WRITE_ONCE() annotations. 
Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260505091133.2452510-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/sch_sfq.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index c3f3181dba54..f39822babf88 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -225,7 +225,8 @@ static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) sfq_unlink(q, x, n, p); - d = q->slots[x].qlen--; + d = q->slots[x].qlen; + WRITE_ONCE(q->slots[x].qlen, d - 1); if (n == p && q->cur_depth == d) q->cur_depth--; sfq_link(q, x); @@ -238,7 +239,8 @@ static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) sfq_unlink(q, x, n, p); - d = ++q->slots[x].qlen; + d = q->slots[x].qlen + 1; + WRITE_ONCE(q->slots[x].qlen, d); if (q->cur_depth < d) q->cur_depth = d; sfq_link(q, x); @@ -298,7 +300,7 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); - slot->backlog -= len; + WRITE_ONCE(slot->backlog, slot->backlog - len); sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); @@ -314,7 +316,7 @@ drop: q->tail = NULL; /* no more active slots */ else q->tail->next = slot->next; - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); goto drop; } @@ -364,10 +366,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS); - q->ht[hash] = x; + WRITE_ONCE(q->ht[hash], x); slot = &q->slots[x]; slot->hash = hash; - slot->backlog = 0; /* should already be 0 anyway... 
*/ + WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */ red_set_vars(&slot->vars); goto enqueue; } @@ -426,7 +428,7 @@ congestion_drop: head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; - slot->backlog -= delta; + WRITE_ONCE(slot->backlog, slot->backlog - delta); qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT); slot_queue_add(slot, skb); @@ -436,7 +438,7 @@ congestion_drop: enqueue: qdisc_qstats_backlog_inc(sch, skb); - slot->backlog += qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ @@ -452,7 +454,7 @@ enqueue: */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ - slot->allot = q->quantum; + WRITE_ONCE(slot->allot, q->quantum); } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; @@ -489,7 +491,7 @@ next_slot: slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; - slot->allot += q->quantum; + WRITE_ONCE(slot->allot, slot->allot + q->quantum); goto next_slot; } skb = slot_dequeue_head(slot); @@ -497,10 +499,10 @@ next_slot: qdisc_bstats_update(sch, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); - slot->backlog -= qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb)); /* Is the slot empty? 
*/ if (slot->qlen == 0) { - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); next_a = slot->next; if (a == next_a) { q->tail = NULL; /* no more active slots */ @@ -508,7 +510,7 @@ next_slot: } q->tail->next = next_a; } else { - slot->allot -= qdisc_pkt_len(skb); + WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb)); } return skb; } @@ -549,9 +551,9 @@ static void sfq_rehash(struct Qdisc *sch) sfq_dec(q, i); __skb_queue_tail(&list, skb); } - slot->backlog = 0; + WRITE_ONCE(slot->backlog, 0); red_set_vars(&slot->vars); - q->ht[slot->hash] = SFQ_EMPTY_SLOT; + WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT); } q->tail = NULL; @@ -570,7 +572,7 @@ drop: dropped++; continue; } - q->ht[hash] = x; + WRITE_ONCE(q->ht[hash], x); slot = &q->slots[x]; slot->hash = hash; } @@ -581,7 +583,7 @@ drop: slot->vars.qavg = red_calc_qavg(q->red_parms, &slot->vars, slot->backlog); - slot->backlog += qdisc_pkt_len(skb); + WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb)); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ @@ -591,7 +593,7 @@ drop: q->tail->next = x; } q->tail = slot; - slot->allot = q->quantum; + WRITE_ONCE(slot->allot, q->quantum); } } sch->q.qlen -= dropped; @@ -905,16 +907,16 @@ static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct sfq_sched_data *q = qdisc_priv(sch); - sfq_index idx = q->ht[cl - 1]; + sfq_index idx = READ_ONCE(q->ht[cl - 1]); struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; - xstats.allot = slot->allot; - qs.qlen = slot->qlen; - qs.backlog = slot->backlog; + xstats.allot = READ_ONCE(slot->allot); + qs.qlen = READ_ONCE(slot->qlen); + qs.backlog = READ_ONCE(slot->backlog); } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; @@ -930,7 +932,7 @@ static void sfq_walk(struct Qdisc *sch, 
struct qdisc_walker *arg) return; for (i = 0; i < q->divisor; i++) { - if (q->ht[i] == SFQ_EMPTY_SLOT) { + if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) { arg->count++; continue; } -- cgit v1.2.3 From c8f7244c8cccaaed4e6c9fe4b8a07e101d0423e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 May 2026 15:39:27 +0000 Subject: tcp: tcp_child_process() related UAF tcp_child_process( .. child ...) currently calls sock_put(child). Unfortunately @child (named @nsk in callers) can be used after this point to send a RST packet. To fix this UAF, I remove the sock_put() from tcp_child_process() and let the callers handle this after it is safe. Remove @rsk variable in tcp_v4_do_rcv() and change tcp_v6_do_rcv() so that both functions look the same. Fixes: cfb6eeb4c860 ("[TCP]: MD5 Signature Option (RFC2385) support.") Reported-by: Damiano Melotti Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260505153927.3435532-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 14 ++++++-------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 13 ++++++++----- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8fc24c3743c5..c0526cc03980 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1827,7 +1827,6 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; - struct sock *rsk; reason = psp_sk_rx_policy_check(sk, skb); if (reason) @@ -1863,24 +1862,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) return 0; if (nsk != sk) { reason = tcp_child_process(sk, nsk, skb); - if (reason) { - rsk = nsk; + sock_put(nsk); + if (reason) goto reset; - } return 0; } } else sock_rps_save_rxhash(sk, skb); reason = tcp_rcv_state_process(sk, skb); - if (reason) { - rsk = sk; + if (reason) goto reset; - } return 0; reset: - 
tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason)); + tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(reason)); discard: sk_skb_reason_drop(sk, skb, reason); /* Be careful here. If this function gets more complicated and @@ -2193,8 +2189,10 @@ lookup: rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v4_send_reset(nsk, skb, rst_reason); + sock_put(nsk); goto discard_and_relse; } + sock_put(nsk); sock_put(sk); return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 199f0b579e89..e6092c3ac840 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -1012,6 +1012,6 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, } bh_unlock_sock(child); - sock_put(child); + return reason; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2c3f7a739709..51583aef0643 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1617,12 +1617,13 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); + if (!nsk) + return 0; if (nsk != sk) { - if (nsk) { - reason = tcp_child_process(sk, nsk, skb); - if (reason) - goto reset; - } + reason = tcp_child_process(sk, nsk, skb); + sock_put(nsk); + if (reason) + goto reset; return 0; } } else @@ -1827,8 +1828,10 @@ lookup: rst_reason = sk_rst_convert_drop_reason(drop_reason); tcp_v6_send_reset(nsk, skb, rst_reason); + sock_put(nsk); goto discard_and_relse; } + sock_put(nsk); sock_put(sk); return 0; } -- cgit v1.2.3 From b12014d2d36eaed4e4bec5f1ac7e91110eeb100d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:49 +0200 Subject: mptcp: pm: kernel: correctly retransmit ADD_ADDR ID 0 When adding the ADD_ADDR to the list, the address including the IP, port and ID are copied. On the other hand, when the endpoint corresponds to the one from the initial subflow, the ID is set to 0, as specified by the MPTCP protocol. 
The issue is that the ID was reset after having copied the ID in the ADD_ADDR entry. So the retransmission was done, but using a different ID than the initial one. Fixes: 8b8ed1b429f8 ("mptcp: pm: reuse ID 0 after delete and re-add") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-1-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_kernel.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index c9f1e5af3cd3..fc818b63752e 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -347,6 +347,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) /* check first for announce */ if (msk->pm.add_addr_signaled < endp_signal_max) { + u8 endp_id; + /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -360,19 +362,20 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) if (!select_signal_address(pernet, msk, &local)) goto subflow; + /* Special case for ID0: set the correct ID */ + endp_id = local.addr.id; + if (endp_id == msk->mpc_endpoint_id) + local.addr.id = 0; + /* If the alloc fails, we are on memory pressure, not worth * continuing, and trying to create subflows. 
*/ if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) return; - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + __clear_bit(endp_id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - mptcp_pm_announce_addr(msk, &local.addr, false); mptcp_pm_addr_send_ack(msk); -- cgit v1.2.3 From 03f324f3f1f7619a47b9c91282cb12775ab0a2f1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:50 +0200 Subject: mptcp: pm: ADD_ADDR rtx: allow ID 0 ADD_ADDR can be sent for the ID 0, which corresponds to the local address and port linked to the initial subflow. Indeed, this address could be removed, and re-added later on, e.g. what is done in the "delete re-add signal" MPTCP Join selftests. So no reason to ignore it. Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-2-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 57a456690406..5056eb8db24e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -337,9 +337,6 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (inet_sk_state_load(sk) == TCP_CLOSE) return; - if (!entry->addr.id) - return; - if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; -- cgit v1.2.3 From 5cd6e0ad79d2615264f63929f8b457ad97ae550d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:51 +0200 Subject: mptcp: pm: ADD_ADDR rtx: fix potential data-race This mptcp_pm_add_timer() helper is executed as a timer callback in softirq context. To avoid any data races, the socket lock needs to be held with bh_lock_sock(). 
If the socket is in use, retry again soon after, similar to what is done with the keepalive timer. Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-3-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 5056eb8db24e..3912128d9b86 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -337,6 +337,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (inet_sk_state_load(sk) == TCP_CLOSE) return; + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + sk_reset_timer(sk, timer, jiffies + HZ / 20); + goto out; + } + if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; @@ -365,6 +372,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) mptcp_pm_subflow_established(msk); out: + bh_unlock_sock(sk); __sock_put(sk); } -- cgit v1.2.3 From 9634cb35af17019baec21ca648516ce376fa10e6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:52 +0200 Subject: mptcp: pm: ADD_ADDR rtx: always decrease sk refcount When an ADD_ADDR is retransmitted, the sk is held in sk_reset_timer(). It should then be released in all cases at the end. Some (unlikely) checks were returning directly instead of calling sock_put() to decrease the refcount. Jump to a new 'exit' label to call __sock_put() (which will become sock_put() in the next commit) to fix this potential leak. While at it, drop the '!msk' check which cannot happen because it is never reset, and explicitly mark the remaining one as "unlikely". 
Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-4-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 3912128d9b86..2a01bf1b5bfd 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -331,11 +331,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer) pr_debug("msk=%p\n", msk); - if (!msk) - return; - - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; + if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) + goto exit; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -373,6 +370,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) out: bh_unlock_sock(sk); +exit: __sock_put(sk); } -- cgit v1.2.3 From b7b9a461569734d33d3259d58d2507adfac107ed Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:53 +0200 Subject: mptcp: pm: ADD_ADDR rtx: free sk if last When an ADD_ADDR is retransmitted, the sk is held in sk_reset_timer(), and released at the end. If at that moment, it was the last reference being held, the sk would not be freed. sock_put() should then be called instead of __sock_put(). But that's not enough: if it is the last reference, sock_put() will call sk_free(), which will end up calling sk_stop_timer_sync() on the same timer, and waiting indefinitely to finish. So it is needed to mark that the timer is done at the end of the timer handler when it has not been rescheduled, not to call sk_stop_timer_sync() on "itself". 
Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-5-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 2a01bf1b5bfd..8899327e59a1 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -16,6 +16,7 @@ struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; + bool timer_done; struct timer_list add_timer; struct mptcp_sock *sock; struct rcu_head rcu; @@ -327,22 +328,22 @@ static void mptcp_pm_add_timer(struct timer_list *timer) add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; - unsigned int timeout; + unsigned int timeout = 0; pr_debug("msk=%p\n", msk); + bh_lock_sock(sk); if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) - goto exit; + goto out; - bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. 
*/ - sk_reset_timer(sk, timer, jiffies + HZ / 20); + timeout = HZ / 20; goto out; } if (mptcp_pm_should_add_signal_addr(msk)) { - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); + timeout = TCP_RTO_MAX / 8; goto out; } @@ -360,8 +361,9 @@ static void mptcp_pm_add_timer(struct timer_list *timer) } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, - jiffies + (timeout << entry->retrans_times)); + timeout <<= entry->retrans_times; + else + timeout = 0; spin_unlock_bh(&msk->pm.lock); @@ -369,9 +371,13 @@ static void mptcp_pm_add_timer(struct timer_list *timer) mptcp_pm_subflow_established(msk); out: + if (timeout) + sk_reset_timer(sk, timer, jiffies + timeout); + else + /* if sock_put calls sk_free: avoid waiting for this timer */ + entry->timer_done = true; bh_unlock_sock(sk); -exit: - __sock_put(sk); + sock_put(sk); } struct mptcp_pm_add_entry * @@ -434,6 +440,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: + add_entry->timer_done = false; timeout = mptcp_adjust_add_addr_timeout(msk); if (timeout) sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); @@ -454,7 +461,8 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { - sk_stop_timer_sync(sk, &entry->add_timer); + if (!entry->timer_done) + sk_stop_timer_sync(sk, &entry->add_timer); kfree_rcu(entry, rcu); } } -- cgit v1.2.3 From 3cf12492891c4b5ff54dda404a2de4ec54c9e1b5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:54 +0200 Subject: mptcp: pm: ADD_ADDR rtx: resched blocked ADD_ADDR quicker When an ADD_ADDR needs to be retransmitted and another one has already been prepared -- e.g. multiple ADD_ADDRs have been sent in a row and need to be retransmitted later -- this additional retransmission will need to wait. 
In this case, the timer was reset to TCP_RTO_MAX / 8, which is ~15 seconds. This delay is unnecessarily long: it should just be rescheduled at the next opportunity, e.g. after the retransmission timeout. Without this modification, some issues can be seen from time to time in the selftests when multiple ADD_ADDRs are sent, and the host takes time to process them, e.g. the "signal addresses, ADD_ADDR timeout" MPTCP Join selftest, especially with a debug kernel config. Note that on older kernels, 'timeout' is not available. It should be enough to replace it with one second (HZ). Fixes: 00cfd77b9063 ("mptcp: retransmit ADD_ADDR when timeout") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-6-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 8899327e59a1..29d1bb6a69cf 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -342,13 +342,8 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } - if (mptcp_pm_should_add_signal_addr(msk)) { - timeout = TCP_RTO_MAX / 8; - goto out; - } - timeout = mptcp_adjust_add_addr_timeout(msk); - if (!timeout) + if (!timeout || mptcp_pm_should_add_signal_addr(msk)) goto out; spin_lock_bh(&msk->pm.lock); -- cgit v1.2.3 From c6d395e2de1306b5fef0344a3c3835fbbfaa18be Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:55 +0200 Subject: mptcp: pm: ADD_ADDR rtx: skip inactive subflows When looking at the maximum RTO amongst the subflows, inactive subflows were taken into account: that includes stale ones, and the initial one if it has already been closed. Unusable subflows are now simply skipped. 
Stale ones are used as an alternative: if there are only stale ones, their maximum RTO is taken, to avoid eventually falling back to net.mptcp.add_addr_timeout, which is set to 2 minutes by default. Fixes: 30549eebc4d8 ("mptcp: make ADD_ADDR retransmission timeout adaptive") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-7-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 29d1bb6a69cf..8a5dba7fe66e 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -306,18 +306,28 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) const struct net *net = sock_net((struct sock *)msk); unsigned int rto = mptcp_get_add_addr_timeout(net); struct mptcp_subflow_context *subflow; - unsigned int max = 0; + unsigned int max = 0, max_stale = 0; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct inet_connection_sock *icsk = inet_csk(ssk); - if (icsk->icsk_rto > max) + if (!__mptcp_subflow_active(subflow)) + continue; + + if (unlikely(subflow->stale)) { + if (icsk->icsk_rto > max_stale) + max_stale = icsk->icsk_rto; + } else if (icsk->icsk_rto > max) { max = icsk->icsk_rto; + } } - if (max && max < rto) - rto = max; + if (max) + return min(max, rto); + + if (max_stale) + return min(max_stale, rto); return rto; } -- cgit v1.2.3 From 62a9b19dce77e72426f049fb99b9d1d032b9a8ea Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:56 +0200 Subject: mptcp: pm: ADD_ADDR rtx: return early if no retrans No need to iterate over all subflows if there is no retransmission needed. Exit early in this case then. 
Fixes: 30549eebc4d8 ("mptcp: make ADD_ADDR retransmission timeout adaptive") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-8-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 8a5dba7fe66e..4a6e5ab30d80 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -308,6 +308,9 @@ static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) struct mptcp_subflow_context *subflow; unsigned int max = 0, max_stale = 0; + if (!rto) + return 0; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct inet_connection_sock *icsk = inet_csk(ssk); -- cgit v1.2.3 From 166b78344031bf7ac9f55cb5282776cfd85f220e Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:57 +0200 Subject: mptcp: pm: prio: skip closed subflows When sending an MP_PRIO, closed subflows need to be skipped. This fixes the case where the initial subflow got closed, re-opened later, then an MP_PRIO is needed for the same local address. Note that explicit MP_PRIO cannot be sent during the 3WHS, so it is fine to use __mptcp_subflow_active(). 
Fixes: 067065422fcd ("mptcp: add the outgoing MP_PRIO support") Cc: stable@vger.kernel.org Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation") Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-9-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 4a6e5ab30d80..3c152bf66cd5 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -284,6 +284,9 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; + if (!__mptcp_subflow_active(subflow)) + continue; + mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; -- cgit v1.2.3 From 65db7b27b90e2ea8d4966935aa9a50b6a60c31ac Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:58 +0200 Subject: selftests: mptcp: check output: catch cmd errors Using '${?}' inside the if-statement to check the returned value from the command that was evaluated as part of the if-statement is not correct: here, '${?}' will be linked to the previous instruction, not the one that is expected here (${cmd}). Instead, simply mark the error, except if an error is expected. If that's the case, 1 can be passed as the 4th argument of this helper. Three checks from pm_netlink.sh expect an error. While at it, improve the error message when the command unexpectedly fails or succeeds. Note that we could expect a specific returned value, but the checks currently expecting an error can be used with 'ip mptcp' or 'pm_nl_ctl', and these two tools don't return the same error code. 
Fixes: 2d0c1d27ea4e ("selftests: mptcp: add mptcp_lib_check_output helper") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-10-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_lib.sh | 16 ++++++++++------ tools/testing/selftests/net/mptcp/pm_netlink.sh | 10 ++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh index 5fea7e7df628..989a5975dcea 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_lib.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh @@ -474,20 +474,24 @@ mptcp_lib_wait_local_port_listen() { wait_local_port_listen "${@}" "tcp" } +# $1: error file, $2: cmd, $3: expected msg, [$4: expected error] mptcp_lib_check_output() { local err="${1}" local cmd="${2}" local expected="${3}" + local exp_error="${4:-0}" local cmd_ret=0 local out - if ! 
out=$(${cmd} 2>"${err}"); then - cmd_ret=${?} - fi + out=$(${cmd} 2>"${err}") || cmd_ret=1 - if [ ${cmd_ret} -ne 0 ]; then - mptcp_lib_pr_fail "command execution '${cmd}' stderr" - cat "${err}" + if [ "${cmd_ret}" != "${exp_error}" ]; then + mptcp_lib_pr_fail "unexpected returned code for '${cmd}', info:" + if [ "${exp_error}" = 0 ]; then + cat "${err}" + else + echo "${out}" + fi return 2 elif [ "${out}" = "${expected}" ]; then return 0 diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 123d9d7a0278..b69f30fcb91e 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -122,10 +122,12 @@ check() local cmd="$1" local expected="$2" local msg="$3" + local exp_error="$4" local rc=0 mptcp_lib_print_title "$msg" - mptcp_lib_check_output "${err}" "${cmd}" "${expected}" || rc=${?} + mptcp_lib_check_output "${err}" "${cmd}" "${expected}" "${exp_error}" || + rc=${?} if [ ${rc} -eq 2 ]; then mptcp_lib_result_fail "${msg} # error ${rc}" ret=${KSFT_FAIL} @@ -158,13 +160,13 @@ check "show_endpoints" \ "3,10.0.1.3,signal backup")" "dump addrs" del_endpoint 2 -check "get_endpoint 2" "" "simple del addr" +check "get_endpoint 2" "" "simple del addr" 1 check "show_endpoints" \ "$(format_endpoints "1,10.0.1.1" \ "3,10.0.1.3,signal backup")" "dump addrs after del" add_endpoint 10.0.1.3 2>/dev/null -check "get_endpoint 4" "" "duplicate addr" +check "get_endpoint 4" "" "duplicate addr" 1 add_endpoint 10.0.1.4 flags signal check "get_endpoint 4" "$(format_endpoints "4,10.0.1.4,signal")" "id addr increment" @@ -173,7 +175,7 @@ for i in $(seq 5 9); do add_endpoint "10.0.1.${i}" flags signal >/dev/null 2>&1 done check "get_endpoint 9" "$(format_endpoints "9,10.0.1.9,signal")" "hard addr limit" -check "get_endpoint 10" "" "above hard addr limit" +check "get_endpoint 10" "" "above hard addr limit" 1 del_endpoint 9 for i in $(seq 10 255); do -- cgit v1.2.3 From 
53705ddfa18408f8e1f064331b6387509fa19f7f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 5 May 2026 17:00:59 +0200 Subject: selftests: mptcp: pm: restrict 'unknown' check to pm_nl_ctl When pm_netlink.sh is executed with '-i', 'ip mptcp' is used instead of 'pm_nl_ctl'. IPRoute2 doesn't support the 'unknown' flag, which has only been added to 'pm_nl_ctl' for this specific check: to ensure that the kernel ignores such unsupported flag. No reason to add this flag to 'ip mptcp'. Then, this check should be skipped when 'ip mptcp' is used. Fixes: 0cef6fcac24d ("selftests: mptcp: ip_mptcp option for more scripts") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20260505-net-mptcp-pm-fixes-7-1-rc3-v1-11-fca8091060a4@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index b69f30fcb91e..04594dfc22b1 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -194,9 +194,13 @@ check "show_endpoints" \ flush_endpoint check "show_endpoints" "" "flush addrs" -add_endpoint 10.0.1.1 flags unknown -check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" "ignore unknown flags" -flush_endpoint +# "unknown" flag is only supported by pm_nl_ctl +if ! 
mptcp_lib_is_ip_mptcp; then + add_endpoint 10.0.1.1 flags unknown + check "show_endpoints" "$(format_endpoints "1,10.0.1.1")" \ + "ignore unknown flags" + flush_endpoint +fi set_limits 9 1 2>/dev/null check "get_limits" "${default_limits}" "rcv addrs above hard limit" -- cgit v1.2.3 From b266bacba796ff5c4dcd2ae2fc08aacf7ab39153 Mon Sep 17 00:00:00 2001 From: Andreas Haarmann-Thiemann Date: Tue, 5 May 2026 23:52:17 +0200 Subject: net: ethernet: cortina: Drop half-assembled SKB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In gmac_rx() (drivers/net/ethernet/cortina/gemini.c), when gmac_get_queue_page() returns NULL for the second page of a multi-page fragment, the driver logs an error and continues — but does not free the skb that was being partially assembled via napi_build_skb() / napi_get_frags(). Free the in-progress, partially assembled skb via napi_free_frags(), increase the number of dropped frames accordingly, and set the skb pointer to NULL to make sure it is not lingering around, matching the pattern already used elsewhere in the driver. 
Fixes: 4d5ae32f5e1e ("net: ethernet: Add a driver for Gemini gigabit ethernet") Signed-off-by: Andreas Haarmann-Thiemann Signed-off-by: Linus Walleij Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20260505-gemini-ethernet-fix-v2-1-997c31d06079@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 4824232f4890..065cbbf52686 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1491,6 +1491,11 @@ static unsigned int gmac_rx(struct net_device *netdev, unsigned int budget) gpage = gmac_get_queue_page(geth, port, mapping + PAGE_SIZE); if (!gpage) { dev_err(geth->dev, "could not find mapping\n"); + if (skb) { + napi_free_frags(&port->napi); + port->stats.rx_dropped++; + skb = NULL; + } continue; } page = gpage->page; -- cgit v1.2.3 From 593dfd40a94ca0ab20297ea4629d94268deed0ed Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Mon, 4 May 2026 18:42:11 -0700 Subject: eth: fbnic: fix double-free of PCS on phylink creation failure fbnic_phylink_create() stores the newly allocated PCS in fbn->pcs and then calls phylink_create(). When phylink_create() fails, the error path correctly destroys the PCS via xpcs_destroy_pcs(), but the caller, fbnic_netdev_alloc(), responds by invoking fbnic_netdev_free() which calls fbnic_phylink_destroy(). That function finds fbn->pcs non-NULL and calls xpcs_destroy_pcs() a second time on the already-freed object, triggering a refcount underflow use-after-free: [ 1.934973] fbnic 0000:01:00.0: Failed to create Phylink interface, err: -22 [ 1.935103] ------------[ cut here ]------------ [ 1.935179] refcount_t: underflow; use-after-free. 
[ 1.935252] WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x59/0x90, CPU#0: swapper/0/1 [ 1.935389] Modules linked in: [ 1.935484] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 7.0.0-virtme-04244-g1f5ffc672165-dirty #1 PREEMPT(lazy) [ 1.935661] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 1.935826] RIP: 0010:refcount_warn_saturate+0x59/0x90 [ 1.935931] Code: 44 48 8d 3d 49 f9 a7 01 67 48 0f b9 3a e9 bf 1e 96 00 48 8d 3d 48 f9 a7 01 67 48 0f b9 3a c3 cc cc cc cc 48 8d 3d 47 f9 a7 01 <67> 48 0f b9 3a c3 cc cc cc cc 48 8d 3d 46 f9 a7 01 67 48 0f b9 3a [ 1.936274] RSP: 0000:ffffd0d440013c58 EFLAGS: 00010246 [ 1.936376] RAX: 0000000000000000 RBX: ffff8f39c188c278 RCX: 000000000000002b [ 1.936524] RDX: ffff8f39c004f000 RSI: 0000000000000003 RDI: ffffffff96abab00 [ 1.936692] RBP: ffff8f39c188c240 R08: ffffffff96988e88 R09: 00000000ffffdfff [ 1.936835] R10: ffffffff96878ea0 R11: 0000000000000187 R12: 0000000000000000 [ 1.936970] R13: ffff8f39c0cef0c8 R14: ffff8f39c1ac01c0 R15: 0000000000000000 [ 1.937114] FS: 0000000000000000(0000) GS:ffff8f3ba08b4000(0000) knlGS:0000000000000000 [ 1.937273] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1.937382] CR2: ffff8f3b3ffff000 CR3: 0000000172642001 CR4: 0000000000372ef0 [ 1.937540] Call Trace: [ 1.937619] [ 1.937698] xpcs_destroy_pcs+0x25/0x40 [ 1.937783] fbnic_netdev_alloc+0x1e5/0x200 [ 1.937859] fbnic_probe+0x230/0x370 [ 1.937939] local_pci_probe+0x3e/0x90 [ 1.938013] pci_device_probe+0xbb/0x1e0 [ 1.938091] ? sysfs_do_create_link_sd+0x6d/0xe0 [ 1.938188] really_probe+0xc1/0x2b0 [ 1.938282] __driver_probe_device+0x73/0x120 [ 1.938371] driver_probe_device+0x1e/0xe0 [ 1.938466] __driver_attach+0x8d/0x190 [ 1.938560] ? __pfx___driver_attach+0x10/0x10 [ 1.938663] bus_for_each_dev+0x7b/0xd0 [ 1.938758] bus_add_driver+0xe8/0x210 [ 1.938854] driver_register+0x60/0x120 [ 1.938929] ? 
__pfx_fbnic_init_module+0x10/0x10 [ 1.939026] fbnic_init_module+0x25/0x60 [ 1.939109] do_one_initcall+0x49/0x220 [ 1.939202] ? rdinit_setup+0x20/0x40 [ 1.939304] kernel_init_freeable+0x1b0/0x310 [ 1.939449] ? __pfx_kernel_init+0x10/0x10 [ 1.939560] kernel_init+0x1a/0x1c0 [ 1.939640] ret_from_fork+0x1ed/0x240 [ 1.939730] ? __pfx_kernel_init+0x10/0x10 [ 1.939805] ret_from_fork_asm+0x1a/0x30 [ 1.939886] [ 1.939927] ---[ end trace 0000000000000000 ]--- [ 1.940184] fbnic 0000:01:00.0: Netdev allocation failed Instead of calling fbnic_phylink_destroy(), the prior initialization of netdev should just be unrolled with free_netdev() and clearing fbd->netdev. Clearing fbd->netdev to NULL avoids UAF in init_failure_mode where callers guard by checking !fbd->netdev, such as fbnic_mdio_read_pmd(). These callers remain active even after a failed probe, so fdb->netdev still needs to be cleared. Fixes: d0fe7104c795 ("fbnic: Replace use of internal PCS w/ Designware XPCS") Signed-off-by: Bobby Eshleman Link: https://patch.msgid.link/20260504-fbnic-pcs-fix-v2-1-de45192821d9@meta.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index c406a3b56b37..4dea2bb58d2f 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -826,7 +826,8 @@ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) netif_tx_stop_all_queues(netdev); if (fbnic_phylink_create(netdev)) { - fbnic_netdev_free(fbd); + free_netdev(netdev); + fbd->netdev = NULL; return NULL; } -- cgit v1.2.3 From 08f566e8f83bb70f04ad5aba5be352c490a01c8a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 May 2026 15:21:53 +0200 Subject: veth: fix OOB txq access in veth_poll() with asymmetric queue counts XDP redirect into a veth device (via bpf_redirect()) calls 
veth_xdp_xmit(), which enqueues frames into the peer's ptr_ring using smp_processor_id() % peer->real_num_rx_queues as the ring index. With an asymmetric veth pair where the peer has fewer TX queues than RX queues, that index can exceed peer->real_num_tx_queues. veth_poll() then resolves peer_txq for the ring via: peer_txq = peer_dev ? netdev_get_tx_queue(peer_dev, queue_idx) : NULL; where queue_idx = rq->xdp_rxq.queue_index. When queue_idx exceeds peer_dev->real_num_tx_queues this is an out-of-bounds (OOB) access into the peer's netdev_queue array, triggering DEBUG_NET_WARN_ON_ONCE in netdev_get_tx_queue(). The normal ndo_start_xmit path is not affected: the stack clamps skb->queue_mapping via netdev_cap_txqueue() before invoking ndo_start_xmit, so rxq in veth_xmit() never exceeds real_num_tx_queues. Fix veth_poll() by clamping: only dereference peer_txq when queue_idx is within bounds, otherwise set it to NULL. The out-of-range rings are fed exclusively via XDP redirect (veth_xdp_xmit), never via ndo_start_xmit (veth_xmit), so the peer txq was never stopped and there is nothing to wake; NULL is the correct fallback. Reported-by: Sashiko Closes: https://lore.kernel.org/all/20260502071828.616C3C19425@smtp.kernel.org/ Fixes: dc82a33297fc ("veth: apply qdisc backpressure on full ptr_ring to reduce TX drops") Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20260505132159.241305-2-hawk@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/veth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index e35df717e65e..0cfb19b760dd 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -972,7 +972,8 @@ static int veth_poll(struct napi_struct *napi, int budget) /* NAPI functions as RCU section */ peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); - peer_txq = peer_dev ? 
netdev_get_tx_queue(peer_dev, queue_idx) : NULL; + peer_txq = (peer_dev && queue_idx < peer_dev->real_num_tx_queues) ? + netdev_get_tx_queue(peer_dev, queue_idx) : NULL; xdp_set_return_frame_no_direct(); done = veth_xdp_rcv(rq, budget, &bq, &stats); -- cgit v1.2.3 From d119775f2bad827edc28071c061fdd4a91f889a5 Mon Sep 17 00:00:00 2001 From: Jiexun Wang Date: Wed, 6 May 2026 22:08:23 +0800 Subject: af_unix: Reject SIOCATMARK on non-stream sockets SIOCATMARK reports whether the receive queue is at the urgent mark for MSG_OOB. In AF_UNIX, MSG_OOB is supported only for SOCK_STREAM sockets. SOCK_DGRAM and SOCK_SEQPACKET reject MSG_OOB in sendmsg() and recvmsg(), so they should not support SIOCATMARK either. Return -EOPNOTSUPP for non-stream sockets before checking the receive queue. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Suggested-by: Kuniyuki Iwashima Signed-off-by: Jiexun Wang Signed-off-by: Ren Wei Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260506140825.2987635-1-n05ec@lzu.edu.cn Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e2d787ca3e74..1cbf36ea043b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3323,6 +3323,9 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct sk_buff *skb; int answ = 0; + if (sk->sk_type != SOCK_STREAM) + return -EOPNOTSUPP; + mutex_lock(&u->iolock); skb = skb_peek(&sk->sk_receive_queue); -- cgit v1.2.3 From 9032f7676935a13fd402608223d326c5f62da9c0 Mon Sep 17 00:00:00 2001 From: "D. 
Wythe" Date: Wed, 6 May 2026 09:41:05 +0800 Subject: net/smc: fix missing sk_err when TCP handshake fails In smc_connect_work(), when the underlying TCP handshake fails, the error code (rc) must be propagated to sk_err to ensure userspace can correctly retrieve the error status via SO_ERROR. Currently, the code only handles a restricted set of error codes (e.g., EPIPE, ECONNREFUSED). If other errors occur, such as EHOSTUNREACH, sk_err remains unset (zero). This affects applications that rely on SO_ERROR to determine connect outcome. For example, higher versions of Go's netpoller treat SO_ERROR == 0 combined with a failed getpeername() as a spurious wakeup and re-enter epoll_wait(). Under ET mode, no further edge will be generated since the socket is already in a terminal state, causing the connect to hang indefinitely or until a user-specified timeout, if one is set. Fixes: 50717a37db03 ("net/smc: nonblocking connect rework") Signed-off-by: D. Wythe Reviewed-by: Dust Li Link: https://patch.msgid.link/20260506014105.27093-1-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/af_smc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 1a565095376a..185dbed7de5d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1628,12 +1628,8 @@ static void smc_connect_work(struct work_struct *work) lock_sock(&smc->sk); if (rc != 0 || smc->sk.sk_err) { smc->sk.sk_state = SMC_CLOSED; - if (rc == -EPIPE || rc == -EAGAIN) - smc->sk.sk_err = EPIPE; - else if (rc == -ECONNREFUSED) - smc->sk.sk_err = ECONNREFUSED; - else if (signal_pending(current)) - smc->sk.sk_err = -sock_intr_errno(timeo); + if (!smc->sk.sk_err) + smc->sk.sk_err = (rc == -EAGAIN) ?
EPIPE : -rc; sock_put(&smc->sk); /* passive closing */ goto out; } -- cgit v1.2.3 From 32cd651d14fc72a93703ea2384cb5cd8998523a8 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 5 May 2026 10:39:26 -0700 Subject: net: phy: broadcom: Save PHY counters during suspend The PHY counters can be lost if the PHY is reset during suspend. We need to save the values into the shadow counters or the accounting will be incorrect over multiple suspend and resume cycles. Fixes: 820ee17b8d3b ("net: phy: broadcom: Add support code for reading PHY counters") Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20260505173926.2870069-1-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/bcm-phy-lib.c | 9 +++++++++ drivers/net/phy/bcm-phy-lib.h | 1 + drivers/net/phy/bcm7xxx.c | 14 ++++++++++++++ drivers/net/phy/broadcom.c | 5 +++++ 4 files changed, 29 insertions(+) diff --git a/drivers/net/phy/bcm-phy-lib.c b/drivers/net/phy/bcm-phy-lib.c index 5198d66dbbc0..b64beade8dd9 100644 --- a/drivers/net/phy/bcm-phy-lib.c +++ b/drivers/net/phy/bcm-phy-lib.c @@ -563,6 +563,15 @@ void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow, } EXPORT_SYMBOL_GPL(bcm_phy_get_stats); +void bcm_phy_update_stats_shadow(struct phy_device *phydev, u64 *shadow) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(bcm_phy_hw_stats); i++) + bcm_phy_get_stat(phydev, shadow, i); +} +EXPORT_SYMBOL_GPL(bcm_phy_update_stats_shadow); + void bcm_phy_r_rc_cal_reset(struct phy_device *phydev) { /* Reset R_CAL/RC_CAL Engine */ diff --git a/drivers/net/phy/bcm-phy-lib.h b/drivers/net/phy/bcm-phy-lib.h index bceddbc860eb..bba94ce96195 100644 --- a/drivers/net/phy/bcm-phy-lib.h +++ b/drivers/net/phy/bcm-phy-lib.h @@ -85,6 +85,7 @@ int bcm_phy_get_sset_count(struct phy_device *phydev); void bcm_phy_get_strings(struct phy_device *phydev, u8 *data); void bcm_phy_get_stats(struct phy_device *phydev, u64 *shadow, struct ethtool_stats *stats, u64 *data); +void 
bcm_phy_update_stats_shadow(struct phy_device *phydev, u64 *shadow); void bcm_phy_r_rc_cal_reset(struct phy_device *phydev); int bcm_phy_28nm_a0b0_afe_config_init(struct phy_device *phydev); int bcm_phy_enable_jumbo(struct phy_device *phydev); diff --git a/drivers/net/phy/bcm7xxx.c b/drivers/net/phy/bcm7xxx.c index 00e8fa14aa77..71a163f62c0e 100644 --- a/drivers/net/phy/bcm7xxx.c +++ b/drivers/net/phy/bcm7xxx.c @@ -807,6 +807,17 @@ static void bcm7xxx_28nm_get_phy_stats(struct phy_device *phydev, bcm_phy_get_stats(phydev, priv->stats, stats, data); } +static int bcm7xxx_28nm_suspend(struct phy_device *phydev) +{ + struct bcm7xxx_phy_priv *priv = phydev->priv; + + mutex_lock(&phydev->lock); + bcm_phy_update_stats_shadow(phydev, priv->stats); + mutex_unlock(&phydev->lock); + + return genphy_suspend(phydev); +} + static int bcm7xxx_28nm_probe(struct phy_device *phydev) { struct bcm7xxx_phy_priv *priv; @@ -849,6 +860,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .flags = PHY_IS_INTERNAL, \ .config_init = bcm7xxx_28nm_config_init, \ .resume = bcm7xxx_28nm_resume, \ + .suspend = bcm7xxx_28nm_suspend, \ .get_tunable = bcm7xxx_28nm_get_tunable, \ .set_tunable = bcm7xxx_28nm_set_tunable, \ .get_sset_count = bcm_phy_get_sset_count, \ @@ -866,6 +878,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .flags = PHY_IS_INTERNAL, \ .config_init = bcm7xxx_28nm_ephy_config_init, \ .resume = bcm7xxx_28nm_ephy_resume, \ + .suspend = bcm7xxx_28nm_suspend, \ .get_sset_count = bcm_phy_get_sset_count, \ .get_strings = bcm_phy_get_strings, \ .get_stats = bcm7xxx_28nm_get_phy_stats, \ @@ -902,6 +915,7 @@ static int bcm7xxx_28nm_probe(struct phy_device *phydev) .config_aneg = genphy_config_aneg, \ .read_status = genphy_read_status, \ .resume = bcm7xxx_16nm_ephy_resume, \ + .suspend = bcm7xxx_28nm_suspend, \ } static struct phy_driver bcm7xxx_driver[] = { diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index bf0c6a04481e..d1a4edb34ad2 100644 --- 
a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -592,8 +592,13 @@ static int bcm54xx_set_wakeup_irq(struct phy_device *phydev, bool state) static int bcm54xx_suspend(struct phy_device *phydev) { + struct bcm54xx_phy_priv *priv = phydev->priv; int ret = 0; + mutex_lock(&phydev->lock); + bcm_phy_update_stats_shadow(phydev, priv->stats); + mutex_unlock(&phydev->lock); + bcm54xx_ptp_stop(phydev); /* Acknowledge any Wake-on-LAN interrupt prior to suspend */ -- cgit v1.2.3 From 019c892e46544af0ae94ec833f79aa903c837666 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 06:59:53 +0000 Subject: ipmr: Call ipmr_fib_lookup() under RCU. Yi Lai reported RCU splat in reg_vif_xmit() below. [0] When CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() uses rcu_dereference() without explicit rcu_read_lock(). Although rcu_read_lock_bh() is already held by the caller __dev_queue_xmit(), lockdep requires explicit rcu_read_lock() for rcu_dereference(). Let's move up rcu_read_lock() in reg_vif_xmit() to cover ipmr_fib_lookup(). [0]: WARNING: suspicious RCU usage 7.1.0-rc2-next-20260504-9d0d467c3572 #1 Not tainted ----------------------------- net/ipv4/ipmr.c:329 suspicious rcu_dereference_check() usage! 
other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by syz.2.17/1779: #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: local_bh_disable include/linux/bottom_half.h:20 [inline] #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: rcu_read_lock_bh include/linux/rcupdate.h:891 [inline] #0: ffffffff87896440 (rcu_read_lock_bh){....}-{1:3}, at: __dev_queue_xmit+0x239/0x4140 net/core/dev.c:4792 #1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: spin_lock include/linux/spinlock.h:342 [inline] #1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: __netif_tx_lock include/linux/netdevice.h:4795 [inline] #1: ffff88801a199d18 (_xmit_PIMREG#2){+...}-{3:3}, at: __dev_queue_xmit+0x1d5d/0x4140 net/core/dev.c:4865 stack backtrace: CPU: 1 UID: 0 PID: 1779 Comm: syz.2.17 Not tainted 7.1.0-rc2-next-20260504-9d0d467c3572 #1 PREEMPT(lazy) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x121/0x150 lib/dump_stack.c:120 dump_stack+0x19/0x20 lib/dump_stack.c:129 lockdep_rcu_suspicious+0x15b/0x1f0 kernel/locking/lockdep.c:6878 ipmr_fib_lookup net/ipv4/ipmr.c:329 [inline] reg_vif_xmit+0x2ee/0x3c0 net/ipv4/ipmr.c:540 __netdev_start_xmit include/linux/netdevice.h:5382 [inline] netdev_start_xmit include/linux/netdevice.h:5391 [inline] xmit_one net/core/dev.c:3889 [inline] dev_hard_start_xmit+0x170/0x700 net/core/dev.c:3905 __dev_queue_xmit+0x1df1/0x4140 net/core/dev.c:4871 dev_queue_xmit include/linux/netdevice.h:3423 [inline] packet_xmit+0x252/0x370 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3082 [inline] packet_sendmsg+0x39ad/0x5650 net/packet/af_packet.c:3114 sock_sendmsg_nosec net/socket.c:797 [inline] __sock_sendmsg net/socket.c:812 [inline] ____sys_sendmsg+0xa21/0xba0 net/socket.c:2716 ___sys_sendmsg+0x121/0x1c0 net/socket.c:2770 __sys_sendmsg+0x177/0x220 net/socket.c:2802 
__do_sys_sendmsg net/socket.c:2807 [inline] __se_sys_sendmsg net/socket.c:2805 [inline] __x64_sys_sendmsg+0x80/0xc0 net/socket.c:2805 x64_sys_call+0x1d9c/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xc1/0x1020 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f37e563ee5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe5caa7fa8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000005c5fa0 RCX: 00007f37e563ee5d RDX: 0000000000000000 RSI: 00002000000012c0 RDI: 0000000000000004 RBP: 00000000005c5fa0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00000000005c5fac R15: 00000000005c5fa0 Fixes: b3b6babf4751 ("ipmr: Free mr_table after RCU grace period.") Reported-by: syzkaller Reported-by: Yi Lai Closes: https://lore.kernel.org/netdev/afrY34dLXNUboevf@ly-workstation/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260506065955.1695753-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 05fb6eefe0be..2628cd3a93a6 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -537,15 +537,16 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) }; int err; + rcu_read_lock(); err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { + rcu_read_unlock(); kfree_skb(skb); return err; } DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); - rcu_read_lock(); /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ ipmr_cache_report(mrt, skb, 
READ_ONCE(mrt->mroute_reg_vif_num), -- cgit v1.2.3 From ecddc523cfdb85b3e132f13e293224ebfdfab564 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 6 May 2026 07:04:42 +0000 Subject: tcp: Fix dst leak in tcp_v6_connect(). If a socket is bound to a wildcard address, tcp_v[46]_connect() updates it with a non-wildcard address based on the route lookup. After bhash2 was introduced in the cited commit, we must call inet_bhash2_update_saddr() to update the bhash2 entry as well. If inet_bhash2_update_saddr() fails, we must release the refcount for dst by ip_route_connect() or ip6_dst_lookup_flow(). While tcp_v4_connect() calls ip_rt_put() in the error path, tcp_v6_connect() does not call dst_release(). Let's call dst_release() when inet_bhash2_update_saddr() fails in tcp_v6_connect(). Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: Damiano Melotti Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260506070443.1699879-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/tcp_ipv6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 51583aef0643..d13d49bfef19 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -288,8 +288,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, saddr = &fl6->saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) + if (err) { + dst_release(dst); goto failure; + } } /* set the source address */ -- cgit v1.2.3 From dedf6c90386d99b878763c183a08b61d3ce4824e Mon Sep 17 00:00:00 2001 From: Joey Lu Date: Wed, 6 May 2026 16:46:13 +0800 Subject: net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in nvt_set_phy_intf_sel() priv->dev was never initialized after devm_kzalloc() allocates the private data structure. 
When nvt_set_phy_intf_sel() is later invoked via the phylink interface_select callback, it calls nvt_gmac_get_delay(priv->dev, ...) which dereferences the NULL pointer. Fix this by assigning priv->dev = dev immediately after allocation. Fixes: 4d7c557f58ef ("net: stmmac: dwmac-nuvoton: Add dwmac glue for Nuvoton MA35 family") Signed-off-by: Joey Lu Link: https://patch.msgid.link/20260506084614.192894-2-a0987203069@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c index e2240b68ad98..2ab6ecac6422 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c @@ -100,6 +100,8 @@ static int nvt_gmac_probe(struct platform_device *pdev) if (!priv) return dev_err_probe(dev, -ENOMEM, "Failed to allocate private data\n"); + priv->dev = dev; + priv->regmap = syscon_regmap_lookup_by_phandle_args(dev->of_node, "nuvoton,sys", 1, &priv->macid); if (IS_ERR(priv->regmap)) -- cgit v1.2.3 From b131dc93f7bf1b1461f5bde0c06c4c2384aa5b58 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Wed, 6 May 2026 09:25:38 +0200 Subject: net: sparx5: fix wrong chip ids for TSN SKUs The TSN SKUs in enum spx5_target_chiptype have incorrect IDs: SPX5_TARGET_CT_7546TSN = 0x47546, SPX5_TARGET_CT_7549TSN = 0x47549, SPX5_TARGET_CT_7552TSN = 0x47552, SPX5_TARGET_CT_7556TSN = 0x47556, SPX5_TARGET_CT_7558TSN = 0x47558, The value read back from the chip is GCB_CHIP_ID_PART_ID, which is a GENMASK(27, 12) field, i.e. at most 16 bits wide. It can never match these IDs, so probing a TSN part fails with a "Target not supported" error. Fix the enum to use the actual 16-bit part IDs returned by the hardware: 0x0546, 0x0549, 0x0552, 0x0556 and 0x0558. 
Reported-by: Andrew Lunn Fixes: 3cfa11bac9bb ("net: sparx5: add the basic sparx5 driver") Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20260506-misc-fixes-sparx5-lan969x-v2-3-fb236aa96908@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_main.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h index 6a745bb71b5c..eb57b86fbe22 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h @@ -31,11 +31,11 @@ enum spx5_target_chiptype { SPX5_TARGET_CT_7552 = 0x7552, /* SparX-5-128 Enterprise */ SPX5_TARGET_CT_7556 = 0x7556, /* SparX-5-160 Enterprise */ SPX5_TARGET_CT_7558 = 0x7558, /* SparX-5-200 Enterprise */ - SPX5_TARGET_CT_7546TSN = 0x47546, /* SparX-5-64i Industrial */ - SPX5_TARGET_CT_7549TSN = 0x47549, /* SparX-5-90i Industrial */ - SPX5_TARGET_CT_7552TSN = 0x47552, /* SparX-5-128i Industrial */ - SPX5_TARGET_CT_7556TSN = 0x47556, /* SparX-5-160i Industrial */ - SPX5_TARGET_CT_7558TSN = 0x47558, /* SparX-5-200i Industrial */ + SPX5_TARGET_CT_7546TSN = 0x0546, /* SparX-5-64i Industrial */ + SPX5_TARGET_CT_7549TSN = 0x0549, /* SparX-5-90i Industrial */ + SPX5_TARGET_CT_7552TSN = 0x0552, /* SparX-5-128i Industrial */ + SPX5_TARGET_CT_7556TSN = 0x0556, /* SparX-5-160i Industrial */ + SPX5_TARGET_CT_7558TSN = 0x0558, /* SparX-5-200i Industrial */ SPX5_TARGET_CT_LAN9694 = 0x9694, /* lan969x-40 */ SPX5_TARGET_CT_LAN9691VAO = 0x9691, /* lan969x-40-VAO */ SPX5_TARGET_CT_LAN9694TSN = 0x9695, /* lan969x-40-TSN */ -- cgit v1.2.3 From 41ae14071cd7f6a7770e2fe1f8a0859d4c2c6ba4 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Wed, 6 May 2026 09:25:39 +0200 Subject: net: sparx5: configure serdes for 1000BASE-X in sparx5_port_init() sparx5_port_init() only invokes sparx5_serdes_set() and the associated shadow-device enable and 
low-speed device switch for SGMII and QSGMII. On any port with a high-speed primary device (DEV5G/DEV10G/DEV25G) configured for 1000BASE-X the serdes is therefore left uninitialized, the DEV2G5 shadow is never enabled, and the port stays pointed at its high-speed device rather than the DEV2G5. The PCS1G block looks healthy in isolation, but no frames reach the link partner. Add 1000BASE-X to the check so the same three steps run. Note: the same issue might apply to 2500BASE-X, but that will, eventually, be addressed in a separate commit. Reported-by: Andrew Lunn Fixes: 946e7fd5053a ("net: sparx5: add port module support") Signed-off-by: Daniel Machon Link: https://patch.msgid.link/20260506-misc-fixes-sparx5-lan969x-v2-4-fb236aa96908@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/sparx5/sparx5_port.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c index 04bc8fffaf96..62c49893de3c 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_port.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_port.c @@ -1128,7 +1128,8 @@ int sparx5_port_init(struct sparx5 *sparx5, DEV2G5_PCS1G_SD_CFG(port->portno)); if (conf->portmode == PHY_INTERFACE_MODE_QSGMII || - conf->portmode == PHY_INTERFACE_MODE_SGMII) { + conf->portmode == PHY_INTERFACE_MODE_SGMII || + conf->portmode == PHY_INTERFACE_MODE_1000BASEX) { err = sparx5_serdes_set(sparx5, port, conf); if (err) return err; -- cgit v1.2.3