From 68de007d5ac9df0e3f4f187a179c5c842bb5a2be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jun 2026 05:56:34 +0000 Subject: xfrm: annotate data-races around xfrm_policy_count[] and xfrm_policy_default[] KCSAN reported a data race involving net->xfrm.policy_count access. Add missing READ_ONCE()/WRITE_ONCE() annotations on xfrm_policy_count and xfrm_policy_default. Fixes: 2518c7c2b3d7 ("[XFRM]: Hash policies when non-prefixed.") Reported-by: syzbot+d85ba1c732720b9a4097@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6a2b9e96.99669fcc.12a77b.0006.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 874409127e29..35a743129329 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1250,8 +1250,8 @@ int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, int dir) { - if (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) - return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT; + if (!READ_ONCE(net->xfrm.policy_count[dir]) && !secpath_exists(skb)) + return READ_ONCE(net->xfrm.policy_default[dir]) == XFRM_USERPOLICY_ACCEPT; return false; } @@ -1351,8 +1351,8 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); - if (!net->xfrm.policy_count[XFRM_POLICY_OUT] && - net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT) + if (!READ_ONCE(net->xfrm.policy_count[XFRM_POLICY_OUT]) && + READ_ONCE(net->xfrm.policy_default[XFRM_POLICY_OUT]) == XFRM_USERPOLICY_ACCEPT) return true; return (skb_dst(skb)->flags & DST_NOXFRM) || -- cgit v1.2.3 From 40f0b1047918539f0b0f795ac65e35336b4c2c78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 15 Jun 2026 09:02:37 +0000 Subject: 
xfrm: validate selector family and prefixlen during match syzbot reported a shift-out-of-bounds in xfrm_selector_match() due to AF_UNSPEC selector with large prefixlen (e.g. 128) matched against IPv4 flow (when XFRM_STATE_AF_UNSPEC is set). Fix this by: - Rejecting mismatched families in xfrm_selector_match. - Returning false in addr4_match if prefixlen > 32. - Returning false in addr_match if prefixlen > 128 (prevents overflow). Fixes: 3f0ab59e6537 ("xfrm: validate new SA's prefixlen using SA family when sel.family is unset") Reported-by: syzbot+9383b1ff0df4b29ca5e6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6a2fbe35.be3f099c.2836ae.0018.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 35a743129329..f8c909b0f0c3 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -943,6 +943,9 @@ static inline bool addr_match(const void *token1, const void *token2, unsigned int pdw; unsigned int pbi; + if (prefixlen > 128) + return false; + pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ @@ -967,6 +970,10 @@ static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen) /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */ if (sizeof(long) == 4 && prefixlen == 0) return true; + + if (prefixlen > 32) + return false; + return !((a1 ^ a2) & htonl(~0UL << (32 - prefixlen))); } -- cgit v1.2.3 From 8165f7ff57d9667d2bb477ef6af83ede7fed4ad7 Mon Sep 17 00:00:00 2001 From: Maoyi Xie Date: Fri, 12 Jun 2026 16:59:35 +0800 Subject: net: ip_gre: require CAP_NET_ADMIN in the device netns for changelink A tunnel changelink() operates on at most two netns, dev_net(dev) and the tunnel link netns t->net. They differ once the device is created in or moved to a netns other than the one the request runs in. 
The rtnl changelink path checks CAP_NET_ADMIN only against dev_net(dev), so a caller privileged there but not in t->net can rewrite a tunnel that lives in t->net. Add rtnl_dev_link_net_capable() next to rtnl_get_net_ns_capable() in net/core/rtnetlink.c. It requires CAP_NET_ADMIN in the link netns and is skipped when the link netns is dev_net(dev), where the rtnl path already checked it. The other patches in this series use the same helper. Gate ipgre_changelink() and erspan_changelink() with it, at the top of the op before any attribute is parsed, because the parsers update live tunnel fields first. ipgre_netlink_parms() sets t->collect_md before ip_tunnel_changelink() runs. Commit 8b484efd5cb4 ("ip6: vti: Use ip6_tnl.net in vti6_siocdevprivate().") added the same check on the ioctl path. This adds it on RTM_NEWLINK. Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCOSzP1vaThGV35_VnsRCb=87_CPjPVsTHbq905k8A+BuUg@mail.gmail.com/ Fixes: b57708add314 ("gre: add x-netns support") Cc: stable@vger.kernel.org Signed-off-by: Maoyi Xie Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260612085941.3158249-2-maoyixie.tju@gmail.com Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index ec65a8cebb99..2bff41aacc98 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -256,6 +256,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); +bool rtnl_dev_link_net_capable(const struct net_device *dev, + const struct net *link_net); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) -- cgit v1.2.3 From 7d8297e26b4e20b5d1c3c3fe51fe81a1c7fbc823 Mon Sep 17 00:00:00 2001 From: Xin Long Date: 
Mon, 15 Jun 2026 15:36:30 -0400 Subject: sctp: hold socket lock when dumping endpoints in sctp_diag SCTP_DIAG endpoint dumping was traversing endpoint address lists without holding lock_sock(), while those lists could change concurrently via socket operations (e.g., bindx changes). This creates a race where nla_reserve() counts addresses under RCU protection, but the subsequent copy may see fewer entries, potentially leaking uninitialized memory to userspace. Fix this by: - Taking a reference on each endpoint during hash traversal - Moving socket operations (lock_sock()) outside read_lock_bh() - Serializing address list access during dump - Reworking sctp_for_each_endpoint() to support restart-based traversal with (net, pos) tracking Also: - Add WARN_ON_ONCE() for inconsistent address counts - Fix idiag_states filtering for LISTEN vs association cases - Skip dumping endpoints being freed (ep->base.dead) - Move dump position tracking into iterator, removing cb->args[4] and its comment for sctp_ep_dump() - Update the comment for cb->args[4] and remove the comment for unused cb->args[5] for sctp_sock_dump(). Note: traversal is restart-based and may re-scan buckets multiple times, but this is acceptable due to small bucket sizes and required to support sleeping-safe callbacks. This issue was reported by Nico Yip (@_cyeaa_) working with TrendAI Zero Day Initiative. 
Reported-by: Zero Day Initiative Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Xin Long Link: https://patch.msgid.link/4c1b49ab87e0f7d552ebd8172b364b1994e913c9.1781552190.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/sctp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 60b073fd3ed8..d50c27812504 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -111,7 +111,8 @@ int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *paddr, void *p, int dif); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); -int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); +int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), + struct net *net, int *pos, void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); -- cgit v1.2.3 From c9c9b37f8c5505224e8d206184df3bb668ee00cf Mon Sep 17 00:00:00 2001 From: Haoze Xie Date: Mon, 8 Jun 2026 13:43:44 +0800 Subject: netfilter: nf_queue: pin bridge device while NFQUEUE holds fake dst The br_netfilter fake rtable is embedded in struct net_bridge and is attached to bridged packets with skb_dst_set_noref(). If such a packet is queued to NFQUEUE, __nf_queue() upgrades that fake dst with skb_dst_force(). At that point the queued skb can hold a real dst reference after bridge teardown has started. The problem is not that every bridged packet needs its own dst reference. The problem is that NFQUEUE can keep the bridge private fake dst alive after unregister begins. Fix this by keeping the bridge fake dst model unchanged and pinning the bridge master device only while the packet sits in NFQUEUE. 
Record the bridge device in nf_queue_entry when the queued skb carries a bridge fake dst, take a device reference for the queue lifetime, and drop it when the queue entry is freed. Also make sure queued entries are reaped when that bridge device goes down, and drop the redundant nf_bridge_info_exists() test from the fake dst detection. This keeps netdev_priv(br->dev) alive until verdict completion, so the embedded fake rtable and its metrics backing storage cannot be freed out from under dst_release(). It also avoids the constant refcount bump and avoids using ipv4-specific dst helpers for IPv6 bridge traffic. Fixes: 34666d467cbf ("netfilter: bridge: move br_netfilter out of the core") Cc: stable@kernel.org Reported-by: Yuan Tan Reported-by: Yifan Wu Reported-by: Juefei Pu Reported-by: Xin Liu Signed-off-by: Haoze Xie Signed-off-by: Ren Wei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 3978c3174cdb..fc3e81c07364 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -18,6 +18,7 @@ struct nf_queue_entry { unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + struct net_device *bridge_dev; struct net_device *physin; struct net_device *physout; #endif -- cgit v1.2.3 From bff1c8b49a9cb5c04af20f4e7d43bf4af5863bc6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 18 Jun 2026 08:16:18 +0200 Subject: netfilter: nft_meta_bridge: add validate callback for get operations Blamed commit added NFT_META_BRI_IIFHWADDR to the set validate callback, yet this is a get operation. Add a get validate callback and move the NFT_META_BRI_IIFHWADDR key there. 
AFAICS this is harmless, NFT_META_BRI_IIFHWADDR can deal with a NULL input device and the set handler ignores a NFT_META_BRI_IIFHWADDR operation, but it allows reading 4 bytes off bridge skb->cb[]. Fixes: cbd2257dc96e ("netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support") Signed-off-by: Florian Westphal Reviewed-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index f74e63290603..6cf1d910bbf8 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -40,6 +40,8 @@ void nft_meta_set_eval(const struct nft_expr *expr, void nft_meta_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr); +int nft_meta_get_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr); int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); -- cgit v1.2.3 From b8b09dc2bf35a00d4e0556b5d6308c7b917ebda2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Jun 2026 13:56:38 +0200 Subject: netfilter: nf_conntrack_expect: use conntrack GC to reap expectations This patch replaces the timer API with a GC worker approach for expectations, as has already happened in many other subsystems. Use the existing conntrack GC worker to iterate over the local list of expectations in the master conntrack to reap expired expectations. Check IPS_HELPER_BIT to run GC for expectations, set it on for nft_ct expectation which never sets it. Hold the expectation spinlock while iterating over the master conntrack expectation list to synchronize with nf_ct_remove_expectations(). This also performs runtime packet path garbage collection through the expectation insertion and lookup functions while walking over one of the chains of the global expectation hashtables. 
Unconfirmed conntrack entries are skipped since ct->ext can be reallocated, and dying entries are skipped since those will be gone soon. Set on IPS_HELPER_BIT if the helper ct extension is added, then the new GC worker does not need to bump the ct refcount to check if the ct->ext helper is available. This removes the extra bump on the refcount for expectation timers, which allows removing several nf_ct_expect_put() calls after the unlink; after this update the refcount remains at 1 while on the expectation hashes. This patch implicitly addresses a race with the existing timer API allowing an expectation to access a stale exp->master pointer which has been already released when expectation removal loses races with an expiring timer, i.e. timer_del() reporting false. Add a new NF_CT_EXPECT_DEAD flag to reap this expectation via GC. This is needed by nf_ct_unexpect_related() which is called in error paths to invalidate newly created expectations that have been added into the hashes. These expectations cannot be immediately released as GC or nf_ct_remove_expectations() could race to do so. On expectation insert, the runtime GC reaps stale expectations before checking the expectation limit set by policy. Set current timestamp in nf_ct_expect_alloc(), then add the expectation policy timeout (or a custom timeout, if specified, added on top of this) to specify the expectation lifetime. 
Fixes: bffcaad9afdf ("netfilter: ctnetlink: ensure safe access to master conntrack") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 16 +++++++++++++--- include/uapi/linux/netfilter/nf_conntrack_common.h | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 80f50fd0f7ad..be4a120d549e 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -54,8 +54,8 @@ struct nf_conntrack_expect { /* The conntrack of the master connection */ struct nf_conn *master; - /* Timer function; deletes the expectation. */ - struct timer_list timeout; + /* jiffies32 when this expectation expires */ + u32 timeout; #if IS_ENABLED(CONFIG_NF_NAT) union nf_inet_addr saved_addr; @@ -69,6 +69,14 @@ struct nf_conntrack_expect { struct rcu_head rcu; }; +static inline bool nf_ct_exp_is_expired(const struct nf_conntrack_expect *exp) +{ + if (READ_ONCE(exp->flags) & NF_CT_EXPECT_DEAD) + return true; + + return (__s32)(READ_ONCE(exp->timeout) - nfct_time_stamp) <= 0; +} + static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) { return read_pnet(&exp->net); @@ -130,7 +138,6 @@ static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); -bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data); void nf_ct_expect_iterate_net(struct net *net, @@ -153,5 +160,8 @@ static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect, return nf_ct_expect_related_report(expect, 0, 0, flags); } +struct nf_conn_help; +void nf_ct_expectation_gc(struct nf_conn_help *master_help); + #endif /*_NF_CONNTRACK_EXPECT_H*/ diff --git 
a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 56b6b60a814f..ee51045ae1d6 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -160,6 +160,7 @@ enum ip_conntrack_expect_events { #define NF_CT_EXPECT_USERSPACE 0x4 #ifdef __KERNEL__ +#define NF_CT_EXPECT_DEAD 0x8 #define NF_CT_EXPECT_MASK (NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE | \ NF_CT_EXPECT_USERSPACE) #endif -- cgit v1.2.3 From 4c6d43db2a4d2cef3921e885cf34798f790d34ea Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 16 Jun 2026 12:03:29 +0200 Subject: net: dst_metadata: fix false-positive memcpy overflow in tun_dst_unclone kmalloc_flex() in metadata_dst_alloc() sets __counted_by for the structure to the options_len, which is then initialized to zero. Later, we're initializing the structure by copying the tunnel info together with the options, and this triggers a warning for a potential memcpy overflow, since the compiler estimates that the options can't fit into the structure, even though the memory for them is actually allocated. memcpy: detected buffer overflow: 104 byte write of buffer size 96 WARNING: CPU: X PID: Y at lib/string_helpers.c:1036 __fortify_report skb_tunnel_info_unclone+0x179/0x190 geneve_xmit+0x7fe/0xe00 The issue is triggered when built with clang and source fortification. Fix that by doing the copy in two stages: first - the main data with the options_len, then the options. This way the correct length should be known at the time of the copy. It would be better if the options_len never changed after allocation, but the allocation code is a little separate from the initialization and it would be awkward and potentially dangerous to return a struct with options_len set to a non-zero value from the metadata_dst_alloc(). Another option would be to use ip_tunnel_info_opts_set(), but it is doing too many unnecessary operations for the use case here. 
Fixes: 69050f8d6d07 ("treewide: Replace kmalloc with kmalloc_obj for non-scalar types") Reported-by: Johan Thomsen Closes: https://lore.kernel.org/netdev/CAKv6aAM8_EWgXScnKmKYm_4SwGDVBK++dzfP+Y6msUXbp99QUw@mail.gmail.com/ Signed-off-by: Ilya Maximets Link: https://patch.msgid.link/20260616100332.1308294-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/net/dst_metadata.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 1fc2fb03ce3f..f45d1e3163f0 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -164,8 +164,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) if (!new_md) return ERR_PTR(-ENOMEM); - memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, - sizeof(struct ip_tunnel_info) + md_size); + /* Copy in two stages to keep the __counted_by happy. */ + new_md->u.tun_info = md_dst->u.tun_info; + memcpy(ip_tunnel_info_opts(&new_md->u.tun_info), + ip_tunnel_info_opts(&md_dst->u.tun_info), md_size); + #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { -- cgit v1.2.3 From b72f0db64205d9ce462038ba995d5d31eff32dc1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 19 Jun 2026 21:27:20 +0000 Subject: ipv4: fib: Don't ignore error route in local/main tables. When CONFIG_IP_MULTIPLE_TABLES is enabled but no rule is added, fib_lookup() performs route lookup directly on two tables. 
Since the first lookup does not properly bail out, the result of an error route in the merged local/main table could be overwritten by another route in the default table: # unshare -n # ip link set lo up # ip route add 192.168.0.0/24 dev lo table 253 # ip route add unreachable 192.168.0.0/24 # ip route get 192.168.0.1 192.168.0.1 dev lo table default uid 0 cache Once a random rule is added, the error route is respected: # ip rule add table 0 # ip rule del table 0 # ip route get 192.168.0.1 RTNETLINK answers: No route to host Let's fix the inconsistent behaviour. Fixes: f4530fa574df ("ipv4: Avoid overhead when no custom FIB rules are installed.") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20260619212753.3367244-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a71a98505650..c63a3c4967ae 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -374,7 +374,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; - int err = -ENETUNREACH; + int err = -EAGAIN; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) @@ -388,17 +388,16 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, if (tb) err = fib_table_lookup(tb, flp, res, flags); - if (!err) + if (err != -EAGAIN) goto out; tb = rcu_dereference_rtnl(net->ipv4.fib_default); if (tb) err = fib_table_lookup(tb, flp, res, flags); -out: if (err == -EAGAIN) err = -ENETUNREACH; - +out: rcu_read_unlock(); return err; -- cgit v1.2.3 From 22f9dbed18bcc865d750ed109c6ae2dd4cf2af55 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 13 Jun 2026 22:25:24 -0700 Subject: netfilter: x_tables.h: fix all kernel-doc warnings - use correct names in kernel-doc comments - add missing struct members to 
kernel-doc comments Warning: include/linux/netfilter/x_tables.h:41 struct member 'targinfo' not described in 'xt_action_param' Warning: include/linux/netfilter/x_tables.h:41 Excess struct member 'targetinfo' description in 'xt_action_param' Warning: include/linux/netfilter/x_tables.h:90 struct member 'family' not described in 'xt_mtchk_param' Warning: include/linux/netfilter/x_tables.h:90 struct member 'nft_compat' not described in 'xt_mtchk_param' Warning: include/linux/netfilter/x_tables.h:101 expecting prototype for struct xt_mdtor_param. Prototype was for struct xt_mtdtor_param instead Warning: include/linux/netfilter/x_tables.h:121 struct member 'net' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:121 struct member 'table' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:121 struct member 'target' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:121 struct member 'targinfo' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:121 struct member 'hook_mask' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:121 struct member 'family' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:121 struct member 'nft_compat' not described in 'xt_tgchk_param' Warning: include/linux/netfilter/x_tables.h:345 expecting prototype for xt_recseq(). 
Prototype was for DECLARE_PER_CPU() instead Signed-off-by: Randy Dunlap Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 20d70dddbe50..25062f4a0dd5 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -18,7 +18,7 @@ * @match: the match extension * @target: the target extension * @matchinfo: per-match data - * @targetinfo: per-target data + * @targinfo: per-target data * @state: pointer to hook state this packet came from * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data @@ -77,7 +77,9 @@ static inline u_int8_t xt_family(const struct xt_action_param *par) * @match: struct xt_match through which this function was invoked * @matchinfo: per-match data * @hook_mask: via which hooks the new rule is reachable - * Other fields as above. + * @family: actual NFPROTO_* through which the function is invoked + * (helpful when match->family == NFPROTO_UNSPEC) + * @nft_compat: running from the nft compat layer if true */ struct xt_mtchk_param { struct net *net; @@ -91,8 +93,13 @@ struct xt_mtchk_param { }; /** - * struct xt_mdtor_param - match destructor parameters - * Fields as above. 
+ * struct xt_mtdtor_param - match destructor parameters + * + * @net: network namespace through which the check was invoked + * @match: struct xt_match through which this function was invoked + * @matchinfo: per-match data + * @family: actual NFPROTO_* through which the function is invoked + * (helpful when match->family == NFPROTO_UNSPEC) */ struct xt_mtdtor_param { struct net *net; @@ -105,10 +112,16 @@ struct xt_mtdtor_param { * struct xt_tgchk_param - parameters for target extensions' * checkentry functions * + * @net: network namespace through which the check was invoked + * @table: table the rule is tried to be inserted into * @entryinfo: the family-specific rule data * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) - * - * Other fields see above. + * @target: the target extension + * @targinfo: per-target data + * @hook_mask: via which hooks the new rule is reachable + * @family: actual NFPROTO_* through which the function is invoked + * (helpful when match->family == NFPROTO_UNSPEC) + * @nft_compat: running from the nft compat layer if true */ struct xt_tgchk_param { struct net *net; @@ -336,9 +349,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size); void xt_free_table_info(struct xt_table_info *info); /** - * xt_recseq - recursive seqcount for netfilter use + * var xt_recseq - recursive seqcount for netfilter use * - * Packet processing changes the seqcount only if no recursion happened + * Packet processing changes the seqcount only if no recursion happened. * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : * Low order bit set to 1 if a writer is active. -- cgit v1.2.3 From 57f940017a777aadf38b99db44cf35f727c26f4c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Jun 2026 08:03:50 +0200 Subject: netfilter: conntrack: add deprecation warnings for irc and pptp trackers IRC Direct client-to-client requires plaintext. 
IRC over TLS should be preferred, making this helper ineffective. Add a deprecation warning and update the help text to better reflect that this is needed for the DCC extension, not IRC itself. PPTP is esoteric these days and it is the only helper that requires the destroy callback in the conntrack helper API. Removal would simplify the conntrack core. Both helpers are IPv4 only. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 81025101f86d..c761cd8158b2 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -114,6 +114,10 @@ int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int, void nf_conntrack_helpers_unregister(struct nf_conntrack_helper **, unsigned int); +#define nf_conntrack_helper_deprecated(name) \ + pr_warn("The %s conntrack helper is scheduled for removal.\n" \ + "Please contact the netfilter-devel mailing list if you still need this.\n", name) + struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, -- cgit v1.2.3 From 979c13114c0bb6ab9135e2c93e00c79c412aef09 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 22 Jun 2026 21:35:14 +0200 Subject: netfilter: nf_conntrack_expect: store master_tuple in expectation Store master conntrack tuple in the expectation since exp->master might refer to a different conntrack when accessed from rcu read side lock area due to typesafe rcu rules. 
Fixes: 02a3231b6d82 ("netfilter: nf_conntrack_expect: store netns and zone in expectation") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_expect.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index be4a120d549e..c024345c9bd8 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -26,6 +26,7 @@ struct nf_conntrack_expect { possible_net_t net; /* We expect this tuple, with the following mask */ + struct nf_conntrack_tuple master_tuple; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; -- cgit v1.2.3 From 1105ef941c1a28e115d1b97f17e1c85576884100 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jun 2026 12:04:39 -0700 Subject: net: ethtool: keep rtnl_lock for ops using ethtool_op_get_link() Breno reports following splats on mlx5: RTNL: assertion failed at net/core/dev.c (2241) WARNING: net/core/dev.c:2241 at netif_state_change+0xed/0x130, CPU#5: ethtool/1335 RIP: 0010:netif_state_change+0xf9/0x130 Call Trace: __linkwatch_sync_dev+0xea/0x120 ethtool_op_get_link+0xe/0x20 __ethtool_get_link+0x26/0x40 linkstate_prepare_data+0x51/0x200 ethnl_default_doit+0x213/0x470 genl_family_rcv_msg_doit+0xdd/0x110 Looks like I missed ethtool_op_get_link() trying to sync linkwatch, which needs rtnl_lock. Not all drivers do this - bnxt doesn't, it just returns the link state, so add an opt-in bit. 
Reported-by: Breno Leitao Fixes: 45079e00133e ("net: ethtool: optionally skip rtnl_lock on Netlink path for GET ops") Acked-by: Stanislav Fomichev Reviewed-by: Breno Leitao Acked-by: Harshitha Ramamurthy Link: https://patch.msgid.link/20260624190439.2521219-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1b834e2a522e..5d491a98265e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -942,6 +942,7 @@ struct kernel_ethtool_ts_info { #define ETHTOOL_OP_NEEDS_RTNL_GPAUSEPARAM BIT(5) #define ETHTOOL_OP_NEEDS_RTNL_SPAUSEPARAM BIT(6) #define ETHTOOL_OP_NEEDS_RTNL_RSS BIT(7) +#define ETHTOOL_OP_NEEDS_RTNL_GLINK BIT(8) /** * struct ethtool_ops - optional netdev operations @@ -978,6 +979,7 @@ struct kernel_ethtool_ts_info { * - phylink helpers (note that phydev is currently unsupported!) * - netdev_update_features() * - netif_set_real_num_tx_queues() + * - ethtool_op_get_link() (syncs link watch under rtnl_lock) * * @get_drvinfo: Report driver/device information. Modern drivers no * longer have to implement this callback. Most fields are -- cgit v1.2.3 From 12c765be84d28f22deca10e775889f54bd571a85 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jun 2026 11:20:15 -0700 Subject: net: turn the rx_mode work into a generic netdev_work facility The rx_mode update runs from a workqueue: drivers have their ndo_set_rx_mode_async() callback executed by a single global work item under RTNL and ops lock. This is a useful pattern. Support multiple "events" that need to be serviced and make RX_MODE sync the first one. Call the events "core" because later on we will let drivers define and schedule their own. 
Reviewed-by: Kuniyuki Iwashima Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260624182018.2445732-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b67a12541eac..732506787db3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1930,8 +1930,9 @@ enum netdev_reg_state { * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() - * @rx_mode_node: List entry for rx_mode work processing - * @rx_mode_tracker: Refcount tracker for rx_mode work + * @work_node: List entry for async netdev_work processing + * @work_tracker: Refcount tracker for async netdev_work + * @work_core_pending: Core-defined pending netdev_work (NETDEV_WORK_*) * @rx_mode_addr_cache: Recycled snapshot entries for rx_mode work * @rx_mode_retry_timer: Timer that re-queues rx_mode work after failure * @rx_mode_retry_count: Number of consecutive retries already scheduled @@ -2326,8 +2327,9 @@ struct net_device { unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; - struct list_head rx_mode_node; - netdevice_tracker rx_mode_tracker; + struct list_head work_node; + netdevice_tracker work_tracker; + unsigned long work_core_pending; struct netdev_hw_addr_list rx_mode_addr_cache; struct timer_list rx_mode_retry_timer; unsigned int rx_mode_retry_count; -- cgit v1.2.3 From 129cdce9da9e44c52d38889e0411be9817bca114 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Jun 2026 11:20:16 -0700 Subject: net: add the driver-facing netdev_work scheduling API With an extra event mask we can easily extend the netdev work to also service driver-defined events. For advanced drivers this is probably not a perfect match, but it makes running deferred work easier in simple cases. 
Expose the netdev_work facility to drivers. Add helpers to schedule work and a dedicated ndo to perform the driver-scheduled actions. Reviewed-by: Kuniyuki Iwashima Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20260624182018.2445732-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 732506787db3..9981d637f8b5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1131,6 +1131,9 @@ struct netdev_net_notifier { * netdev_hw_addr_list_for_each(ha, uc). Return 0 on success or a * negative errno to request a retry via the core backoff. * + * void (*ndo_work)(struct net_device *dev, unsigned long events); + * Run deferred work scheduled with netdev_work_sched(@events). + * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address * needs to be changed.
If this interface is not defined, the @@ -1460,6 +1463,8 @@ struct net_device_ops { struct net_device *dev, struct netdev_hw_addr_list *uc, struct netdev_hw_addr_list *mc); + void (*ndo_work)(struct net_device *dev, + unsigned long events); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); @@ -1932,6 +1937,8 @@ enum netdev_reg_state { * does not implement ndo_set_rx_mode() * @work_node: List entry for async netdev_work processing * @work_tracker: Refcount tracker for async netdev_work + * @work_pending: Driver-defined pending netdev_work, passed to + * ndo_work() (see netdev_work_sched()) * @work_core_pending: Core-defined pending netdev_work (NETDEV_WORK_*) * @rx_mode_addr_cache: Recycled snapshot entries for rx_mode work * @rx_mode_retry_timer: Timer that re-queues rx_mode work after failure @@ -2329,6 +2336,7 @@ struct net_device { bool uc_promisc; struct list_head work_node; netdevice_tracker work_tracker; + unsigned long work_pending; unsigned long work_core_pending; struct netdev_hw_addr_list rx_mode_addr_cache; struct timer_list rx_mode_retry_timer; @@ -5178,6 +5186,9 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, const struct pcpu_sw_netstats __percpu *netstats); void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); +void netdev_work_sched(struct net_device *dev, unsigned long events); +unsigned long netdev_work_cancel(struct net_device *dev, unsigned long mask); + enum { NESTED_SYNC_IMM_BIT, NESTED_SYNC_TODO_BIT, -- cgit v1.2.3