From 68de007d5ac9df0e3f4f187a179c5c842bb5a2be Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 12 Jun 2026 05:56:34 +0000
Subject: xfrm: annotate data-races around xfrm_policy_count[] and
 xfrm_policy_default[]

KCSAN reported a data race involving net->xfrm.policy_count access.

Add missing READ_ONCE()/WRITE_ONCE() annotations on
xfrm_policy_count and xfrm_policy_default.

Fixes: 2518c7c2b3d7 ("[XFRM]: Hash policies when non-prefixed.")
Reported-by: syzbot+d85ba1c732720b9a4097@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6a2b9e96.99669fcc.12a77b.0006.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 874409127e29..35a743129329 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1250,8 +1250,8 @@ int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb,
 static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb,
 					 int dir)
 {
-	if (!net->xfrm.policy_count[dir] && !secpath_exists(skb))
-		return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT;
+	if (!READ_ONCE(net->xfrm.policy_count[dir]) && !secpath_exists(skb))
+		return READ_ONCE(net->xfrm.policy_default[dir]) == XFRM_USERPOLICY_ACCEPT;
 
 	return false;
 }
@@ -1351,8 +1351,8 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
 {
 	struct net *net = dev_net(skb->dev);
 
-	if (!net->xfrm.policy_count[XFRM_POLICY_OUT] &&
-	    net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT)
+	if (!READ_ONCE(net->xfrm.policy_count[XFRM_POLICY_OUT]) &&
+	    READ_ONCE(net->xfrm.policy_default[XFRM_POLICY_OUT]) == XFRM_USERPOLICY_ACCEPT)
 		return true;
 
 	return (skb_dst(skb)->flags & DST_NOXFRM) ||
-- 
cgit v1.2.3


From 40f0b1047918539f0b0f795ac65e35336b4c2c78 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 15 Jun 2026 09:02:37 +0000
Subject: xfrm: validate selector family and prefixlen during match

syzbot reported a shift-out-of-bounds in xfrm_selector_match()
due to AF_UNSPEC selector with large prefixlen (e.g. 128) matched
against IPv4 flow (when XFRM_STATE_AF_UNSPEC is set).

Fix this by:

- Rejecting mismatched families in xfrm_selector_match.
- Returning false in addr4_match if prefixlen > 32.
- Returning false in addr_match if prefixlen > 128 (prevents overflow).

Fixes: 3f0ab59e6537 ("xfrm: validate new SA's prefixlen using SA family when sel.family is unset")
Reported-by: syzbot+9383b1ff0df4b29ca5e6@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/netdev/6a2fbe35.be3f099c.2836ae.0018.GAE@google.com/T/#u
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 35a743129329..f8c909b0f0c3 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -943,6 +943,9 @@ static inline bool addr_match(const void *token1, const void *token2,
 	unsigned int pdw;
 	unsigned int pbi;
 
+	if (prefixlen > 128)
+		return false;
+
 	pdw = prefixlen >> 5;	  /* num of whole u32 in prefix */
 	pbi = prefixlen &  0x1f;  /* num of bits in incomplete u32 in prefix */
 
@@ -967,6 +970,10 @@ static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen)
 	/* C99 6.5.7 (3): u32 << 32 is undefined behaviour */
 	if (sizeof(long) == 4 && prefixlen == 0)
 		return true;
+
+	if (prefixlen > 32)
+		return false;
+
 	return !((a1 ^ a2) & htonl(~0UL << (32 - prefixlen)));
 }
 
-- 
cgit v1.2.3


From 8165f7ff57d9667d2bb477ef6af83ede7fed4ad7 Mon Sep 17 00:00:00 2001
From: Maoyi Xie <maoyixie.tju@gmail.com>
Date: Fri, 12 Jun 2026 16:59:35 +0800
Subject: net: ip_gre: require CAP_NET_ADMIN in the device netns for changelink

A tunnel changelink() operates on at most two netns, dev_net(dev) and
the tunnel link netns t->net. They differ once the device is created in
or moved to a netns other than the one the request runs in. The rtnl
changelink path checks CAP_NET_ADMIN only against dev_net(dev), so a
caller privileged there but not in t->net can rewrite a tunnel that
lives in t->net.

Add rtnl_dev_link_net_capable() next to rtnl_get_net_ns_capable() in
net/core/rtnetlink.c. It requires CAP_NET_ADMIN in the link netns and is
skipped when the link netns is dev_net(dev), where the rtnl path already
checked it. The other patches in this series use the same helper.

Gate ipgre_changelink() and erspan_changelink() with it, at the top of
the op before any attribute is parsed, because the parsers update live
tunnel fields first. ipgre_netlink_parms() sets t->collect_md before
ip_tunnel_changelink() runs.

Commit 8b484efd5cb4 ("ip6: vti: Use ip6_tnl.net in
vti6_siocdevprivate().") added the same check on the ioctl path. This
adds it on RTM_NEWLINK.

Reported-by: Xiao Liang <shaw.leon@gmail.com>
Closes: https://lore.kernel.org/netdev/CABAhCOSzP1vaThGV35_VnsRCb=87_CPjPVsTHbq905k8A+BuUg@mail.gmail.com/
Fixes: b57708add314 ("gre: add x-netns support")
Cc: stable@vger.kernel.org
Signed-off-by: Maoyi Xie <maoyixie.tju@gmail.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260612085941.3158249-2-maoyixie.tju@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rtnetlink.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index ec65a8cebb99..2bff41aacc98 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -256,6 +256,8 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
 int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
 			     struct netlink_ext_ack *exterr);
 struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid);
+bool rtnl_dev_link_net_capable(const struct net_device *dev,
+			       const struct net *link_net);
 
 #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind)
 
-- 
cgit v1.2.3


From 7d8297e26b4e20b5d1c3c3fe51fe81a1c7fbc823 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 15 Jun 2026 15:36:30 -0400
Subject: sctp: hold socket lock when dumping endpoints in sctp_diag

SCTP_DIAG endpoint dumping was traversing endpoint address lists without
holding lock_sock(), while those lists could change concurrently via
socket operations (e.g., bindx changes). This creates a race where
nla_reserve() counts addresses under RCU protection, but the subsequent
copy may see fewer entries, potentially leaking uninitialized memory to
userspace.

Fix this by:

- Taking a reference on each endpoint during hash traversal
- Moving socket operations (lock_sock()) outside read_lock_bh()
- Serializing address list access during dump
- Reworking sctp_for_each_endpoint() to support restart-based traversal
  with (net, pos) tracking

Also:

- Add WARN_ON_ONCE() for inconsistent address counts
- Fix idiag_states filtering for LISTEN vs association cases
- Skip dumping endpoints being freed (ep->base.dead)
- Move dump position tracking into iterator, removing cb->args[4] and
  its comment for sctp_ep_dump().,
- Update the comment for cb->args[4] and remove the comment for unused
  cb->args[5] for sctp_sock_dump().

Note: traversal is restart-based and may re-scan buckets multiple times,
but this is acceptable due to small bucket sizes and required to support
sleeping-safe callbacks.

This issue was reported by Nico Yip (@_cyeaa_) working with TrendAI Zero
Day Initiative.

Reported-by: Zero Day Initiative <zdi-disclosures@trendmicro.com>
Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Link: https://patch.msgid.link/4c1b49ab87e0f7d552ebd8172b364b1994e913c9.1781552190.git.lucien.xin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/sctp/sctp.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 60b073fd3ed8..d50c27812504 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -111,7 +111,8 @@ int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net,
 				  const union sctp_addr *paddr, void *p, int dif);
 int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done,
 				    struct net *net, int *pos, void *p);
-int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p);
+int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *),
+			   struct net *net, int *pos, void *p);
 int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
 		       struct sctp_info *info);
 
-- 
cgit v1.2.3


From c9c9b37f8c5505224e8d206184df3bb668ee00cf Mon Sep 17 00:00:00 2001
From: Haoze Xie <royenheart@gmail.com>
Date: Mon, 8 Jun 2026 13:43:44 +0800
Subject: netfilter: nf_queue: pin bridge device while NFQUEUE holds fake dst

The br_netfilter fake rtable is embedded in struct net_bridge and is
attached to bridged packets with skb_dst_set_noref(). If such a packet is
queued to NFQUEUE, __nf_queue() upgrades that fake dst with
skb_dst_force().

At that point the queued skb can hold a real dst reference after bridge
teardown has started. The problem is not that every bridged packet needs
its own dst reference. The problem is that NFQUEUE can keep the bridge
private fake dst alive after unregister begins.

Fix this by keeping the bridge fake dst model unchanged and pinning the
bridge master device only while the packet sits in NFQUEUE. Record the
bridge device in nf_queue_entry when the queued skb carries a bridge fake
dst, take a device reference for the queue lifetime, and drop it when the
queue entry is freed.

Also make sure queued entries are reaped when that bridge device goes
down, and drop the redundant nf_bridge_info_exists() test from the fake
dst detection.

This keeps netdev_priv(br->dev) alive until verdict completion, so the
embedded fake rtable and its metrics backing storage cannot be freed out
from under dst_release(). It also avoids the constant refcount bump and
avoids using ipv4-specific dst helpers for IPv6 bridge traffic.

Fixes: 34666d467cbf ("netfilter: bridge: move br_netfilter out of the core")
Cc: stable@kernel.org
Reported-by: Yuan Tan <yuantan098@gmail.com>
Reported-by: Yifan Wu <yifanwucs@gmail.com>
Reported-by: Juefei Pu <tomapufckgml@gmail.com>
Reported-by: Xin Liu <bird@lzu.edu.cn>
Signed-off-by: Haoze Xie <royenheart@gmail.com>
Signed-off-by: Ren Wei <n05ec@lzu.edu.cn>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_queue.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 3978c3174cdb..fc3e81c07364 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -18,6 +18,7 @@ struct nf_queue_entry {
 	unsigned int		id;
 	unsigned int		hook_index;	/* index in hook_entries->hook[] */
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	struct net_device	*bridge_dev;
 	struct net_device	*physin;
 	struct net_device	*physout;
 #endif
-- 
cgit v1.2.3


From bff1c8b49a9cb5c04af20f4e7d43bf4af5863bc6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Jun 2026 08:16:18 +0200
Subject: netfilter: nft_meta_bridge: add validate callback for get operations

Blamed commit added NFT_META_BRI_IIFHWADDR to the set validate callback,
yet this is a get operation.

Add a get validate callback and move the NFT_META_BRI_IIFHWADDR key
there.

AFAICS this is harmless, NFT_META_BRI_IIFHWADDR can deal with a NULL
input device and the set handler ignores a NFT_META_BRI_IIFHWADDR
operation, but it allows to read 4 bytes off bridge skb->cb[].

Fixes: cbd2257dc96e ("netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support")
Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Fernando Fernandez Mancera <fmancera@suse.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_meta.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
index f74e63290603..6cf1d910bbf8 100644
--- a/include/net/netfilter/nft_meta.h
+++ b/include/net/netfilter/nft_meta.h
@@ -40,6 +40,8 @@ void nft_meta_set_eval(const struct nft_expr *expr,
 void nft_meta_set_destroy(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr);
 
+int nft_meta_get_validate(const struct nft_ctx *ctx,
+			  const struct nft_expr *expr);
 int nft_meta_set_validate(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr);
 
-- 
cgit v1.2.3


From b8b09dc2bf35a00d4e0556b5d6308c7b917ebda2 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 18 Jun 2026 13:56:38 +0200
Subject: netfilter: nf_conntrack_expect: use conntrack GC to reap expectations

This patch replaces the timer API by GC worker approach for
expectations, as it already happened in many other subsystems.

Use the existing conntrack GC worker to iterate over the local list of
expectations in the master conntrack to reap expired expectations.
Check IPS_HELPER_BIT to run GC for expectations, set it on for nft_ct
expectation which nevers sets it. Hold the expectation spinlock while
iterating over the master conntrack expectation list to synchronize with
nf_ct_remove_expectations(). This also performs runtime packet path
garbage collection through the expectation insertion and lookup
functions while walking over one of the chains of the global expectation
hashtables. Unconfirmed conntrack entries are skipped since ct->ext can
be reallocated and dying are skipped since those will be gone soon.
Set on IPS_HELPER_BIT if the helper ct extension is added, then the new
GC worker does not need to bump the ct refcount to check if the ct->ext
helper is available.

This removes the extra bump on the refcount for expectation timers, this
allows to remove several nf_ct_expect_put() calls after the unlink,
after this update only refcount remains at 1 while on the expectation
hashes.

This patch implicitly addresses a race with the existing timer API
allowing an expectation to access a stale exp->master pointer which has
been already released when expectation removal loses races with an
expiring timer, ie. timer_del() reporting false.

Add a new NF_CT_EXPECT_DEAD flag to reap this expectation via GC. This
is needed by nf_conntrack_unexpect_related() which is called in error
paths to invalidate newly created expectations that has been added into
the hashes. These expectactions cannot be inmediately released as GC or
nf_ct_remove_expectations() could race to make it. On expectation
insert, the runtime GC reaps stale expectations before checking the
expectation limit set by policy.

Set current timestamp in nf_ct_expect_alloc(), then add the expectation
policy timeout (or custom timeout specified added on top of this) to
specify the expectation lifetime.

Fixes: bffcaad9afdf ("netfilter: ctnetlink: ensure safe access to master conntrack")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_expect.h        | 16 +++++++++++++---
 include/uapi/linux/netfilter/nf_conntrack_common.h |  1 +
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 80f50fd0f7ad..be4a120d549e 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -54,8 +54,8 @@ struct nf_conntrack_expect {
 	/* The conntrack of the master connection */
 	struct nf_conn *master;
 
-	/* Timer function; deletes the expectation. */
-	struct timer_list timeout;
+	/* jiffies32 when this expectation expires */
+	u32 timeout;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
 	union nf_inet_addr saved_addr;
@@ -69,6 +69,14 @@ struct nf_conntrack_expect {
 	struct rcu_head rcu;
 };
 
+static inline bool nf_ct_exp_is_expired(const struct nf_conntrack_expect *exp)
+{
+	if (READ_ONCE(exp->flags) & NF_CT_EXPECT_DEAD)
+		return true;
+
+	return (__s32)(READ_ONCE(exp->timeout) - nfct_time_stamp) <= 0;
+}
+
 static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp)
 {
 	return read_pnet(&exp->net);
@@ -130,7 +138,6 @@ static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
 
 void nf_ct_remove_expectations(struct nf_conn *ct);
 void nf_ct_unexpect_related(struct nf_conntrack_expect *exp);
-bool nf_ct_remove_expect(struct nf_conntrack_expect *exp);
 
 void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data);
 void nf_ct_expect_iterate_net(struct net *net,
@@ -153,5 +160,8 @@ static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect,
 	return nf_ct_expect_related_report(expect, 0, 0, flags);
 }
 
+struct nf_conn_help;
+void nf_ct_expectation_gc(struct nf_conn_help *master_help);
+
 #endif /*_NF_CONNTRACK_EXPECT_H*/
 
diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 56b6b60a814f..ee51045ae1d6 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -160,6 +160,7 @@ enum ip_conntrack_expect_events {
 #define NF_CT_EXPECT_USERSPACE		0x4
 
 #ifdef __KERNEL__
+#define NF_CT_EXPECT_DEAD		0x8
 #define NF_CT_EXPECT_MASK	(NF_CT_EXPECT_PERMANENT | NF_CT_EXPECT_INACTIVE | \
 				 NF_CT_EXPECT_USERSPACE)
 #endif
-- 
cgit v1.2.3


From 4c6d43db2a4d2cef3921e885cf34798f790d34ea Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Tue, 16 Jun 2026 12:03:29 +0200
Subject: net: dst_metadata: fix false-positive memcpy overflow in
 tun_dst_unclone

kmalloc_flex() in metadata_dst_alloc() sets __counted_by for the
structure to the options_len, which is then initialized to zero.
Later, we're initializing the structure by copying the tunnel info
together with the options, and this triggers a warning for a potential
memcpy overflow, since the compiler estimates that the options can't
fit into the structure, even though the memory for them is actually
allocated.

 memcpy: detected buffer overflow: 104 byte write of buffer size 96
 WARNING: CPU: X PID: Y at lib/string_helpers.c:1036 __fortify_report
  skb_tunnel_info_unclone+0x179/0x190
  geneve_xmit+0x7fe/0xe00

The issue is triggered when built with clang and source fortification.

Fix that by doing the copy in two stages: first - the main data with
the options_len, then the options.  This way the correct length should
be known at the time of the copy.

It would be better if the options_len never changed after allocation,
but the allocation code is a little separate from the initialization
and it would be awkward and potentially dangerous to return a struct
with options_len set to a non-zero value from the metadata_dst_alloc().

Another option would be to use ip_tunnel_info_opts_set(), but it is
doing too many unnecessary operations for the use case here.

Fixes: 69050f8d6d07 ("treewide: Replace kmalloc with kmalloc_obj for non-scalar types")
Reported-by: Johan Thomsen <write@ownrisk.dk>
Closes: https://lore.kernel.org/netdev/CAKv6aAM8_EWgXScnKmKYm_4SwGDVBK++dzfP+Y6msUXbp99QUw@mail.gmail.com/
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Link: https://patch.msgid.link/20260616100332.1308294-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst_metadata.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 1fc2fb03ce3f..f45d1e3163f0 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -164,8 +164,11 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
 	if (!new_md)
 		return ERR_PTR(-ENOMEM);
 
-	memcpy(&new_md->u.tun_info, &md_dst->u.tun_info,
-	       sizeof(struct ip_tunnel_info) + md_size);
+	/* Copy in two stages to keep the __counted_by happy. */
+	new_md->u.tun_info = md_dst->u.tun_info;
+	memcpy(ip_tunnel_info_opts(&new_md->u.tun_info),
+	       ip_tunnel_info_opts(&md_dst->u.tun_info), md_size);
+
 #ifdef CONFIG_DST_CACHE
 	/* Unclone the dst cache if there is one */
 	if (new_md->u.tun_info.dst_cache.cache) {
-- 
cgit v1.2.3


From b72f0db64205d9ce462038ba995d5d31eff32dc1 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: Fri, 19 Jun 2026 21:27:20 +0000
Subject: ipv4: fib: Don't ignore error route in local/main tables.

When CONFIG_IP_MULTIPLE_TABLES is enabled but no rule is added,
fib_lookup() performs route lookup directly on two tables.

Since the first lookup does not properly bail out, the result
of an error route in the merged local/main table could be
overwritten by another route in the default table:

  # unshare -n
  # ip link set lo up
  # ip route add 192.168.0.0/24 dev lo table 253
  # ip route add unreachable 192.168.0.0/24
  # ip route get 192.168.0.1
  192.168.0.1 dev lo table default uid 0
      cache <local>

Once a random rule is added, the error route is respected:

  # ip rule add table 0
  # ip rule del table 0
  # ip route get 192.168.0.1
  RTNETLINK answers: No route to host

Let's fix the inconsistent behaviour.

Fixes: f4530fa574df ("ipv4: Avoid overhead when no custom FIB rules are installed.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://patch.msgid.link/20260619212753.3367244-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip_fib.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a71a98505650..c63a3c4967ae 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -374,7 +374,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 			     struct fib_result *res, unsigned int flags)
 {
 	struct fib_table *tb;
-	int err = -ENETUNREACH;
+	int err = -EAGAIN;
 
 	flags |= FIB_LOOKUP_NOREF;
 	if (net->ipv4.fib_has_custom_rules)
@@ -388,17 +388,16 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp,
 	if (tb)
 		err = fib_table_lookup(tb, flp, res, flags);
 
-	if (!err)
+	if (err != -EAGAIN)
 		goto out;
 
 	tb = rcu_dereference_rtnl(net->ipv4.fib_default);
 	if (tb)
 		err = fib_table_lookup(tb, flp, res, flags);
 
-out:
 	if (err == -EAGAIN)
 		err = -ENETUNREACH;
-
+out:
 	rcu_read_unlock();
 
 	return err;
-- 
cgit v1.2.3


From 22f9dbed18bcc865d750ed109c6ae2dd4cf2af55 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 13 Jun 2026 22:25:24 -0700
Subject: netfilter: x_tables.h: fix all kernel-doc warnings

- use correct names in kernel-doc comments
- add missing struct members to kernel-doc comments

Warning: include/linux/netfilter/x_tables.h:41 struct member 'targinfo' not described in 'xt_action_param'
Warning: include/linux/netfilter/x_tables.h:41 Excess struct member 'targetinfo' description in 'xt_action_param'
Warning: include/linux/netfilter/x_tables.h:90 struct member 'family' not described in 'xt_mtchk_param'
Warning: include/linux/netfilter/x_tables.h:90 struct member 'nft_compat' not described in 'xt_mtchk_param'
Warning: include/linux/netfilter/x_tables.h:101 expecting prototype for struct xt_mdtor_param. Prototype was for struct xt_mtdtor_param instead

Warning: include/linux/netfilter/x_tables.h:121 struct member 'net' not described in 'xt_tgchk_param'
Warning: include/linux/netfilter/x_tables.h:121 struct member 'table' not described in 'xt_tgchk_param'
Warning: include/linux/netfilter/x_tables.h:121 struct member 'target' not described in 'xt_tgchk_param'
Warning: include/linux/netfilter/x_tables.h:121 struct member 'targinfo' not described in 'xt_tgchk_param'
Warning: include/linux/netfilter/x_tables.h:121 struct member 'hook_mask' not described in 'xt_tgchk_param'
Warning: include/linux/netfilter/x_tables.h:121 struct member 'family' not described in 'xt_tgchk_param'
Warning: include/linux/netfilter/x_tables.h:121 struct member 'nft_compat' not described in 'xt_tgchk_param'

Warning: include/linux/netfilter/x_tables.h:345 expecting prototype for xt_recseq(). Prototype was for DECLARE_PER_CPU() instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/x_tables.h | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 20d70dddbe50..25062f4a0dd5 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -18,7 +18,7 @@
  * @match:	the match extension
  * @target:	the target extension
  * @matchinfo:	per-match data
- * @targetinfo:	per-target data
+ * @targinfo:	per-target data
  * @state:	pointer to hook state this packet came from
  * @fragoff:	packet is a fragment, this is the data offset
  * @thoff:	position of transport header relative to skb->data
@@ -77,7 +77,9 @@ static inline u_int8_t xt_family(const struct xt_action_param *par)
  * @match:	struct xt_match through which this function was invoked
  * @matchinfo:	per-match data
  * @hook_mask:	via which hooks the new rule is reachable
- * Other fields as above.
+ * @family:	actual NFPROTO_* through which the function is invoked
+ *		(helpful when match->family == NFPROTO_UNSPEC)
+ * @nft_compat:	running from the nft compat layer if true
  */
 struct xt_mtchk_param {
 	struct net *net;
@@ -91,8 +93,13 @@ struct xt_mtchk_param {
 };
 
 /**
- * struct xt_mdtor_param - match destructor parameters
- * Fields as above.
+ * struct xt_mtdtor_param - match destructor parameters
+ *
+ * @net:	network namespace through which the check was invoked
+ * @match:	struct xt_match through which this function was invoked
+ * @matchinfo:	per-match data
+ * @family:	actual NFPROTO_* through which the function is invoked
+ *		(helpful when match->family == NFPROTO_UNSPEC)
  */
 struct xt_mtdtor_param {
 	struct net *net;
@@ -105,10 +112,16 @@ struct xt_mtdtor_param {
  * struct xt_tgchk_param - parameters for target extensions'
  * checkentry functions
  *
+ * @net:	network namespace through which the check was invoked
+ * @table:	table the rule is tried to be inserted into
  * @entryinfo:	the family-specific rule data
  * 		(struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry)
- *
- * Other fields see above.
+ * @target:	the target extension
+ * @targinfo:	per-target data
+ * @hook_mask:	via which hooks the new rule is reachable
+ * @family:	actual NFPROTO_* through which the function is invoked
+ *		(helpful when match->family == NFPROTO_UNSPEC)
+ * @nft_compat:	running from the nft compat layer if true
  */
 struct xt_tgchk_param {
 	struct net *net;
@@ -336,9 +349,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size);
 void xt_free_table_info(struct xt_table_info *info);
 
 /**
- * xt_recseq - recursive seqcount for netfilter use
+ * var xt_recseq - recursive seqcount for netfilter use
  *
- * Packet processing changes the seqcount only if no recursion happened
+ * Packet processing changes the seqcount only if no recursion happened.
  * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
  * because we use the normal seqcount convention :
  * Low order bit set to 1 if a writer is active.
-- 
cgit v1.2.3


From 57f940017a777aadf38b99db44cf35f727c26f4c Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 12 Jun 2026 08:03:50 +0200
Subject: netfilter: conntrack: add deprecation warnings for irc and pptp
 trackers

IRC Direct client-to-client requires plaintext.  IRC over TLS should be
preferred, making this helper ineffective.  Add a deprecation warning and
update the help text to better reflect that this is needed for the DCC
extension, not IRC itself.

PPTP is esoteric these days and it is the only helper that requires the
destroy callback in the conntrack helper API.

Removal would simplify the conntrack core.

Both helpers are IPv4 only.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_helper.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 81025101f86d..c761cd8158b2 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -114,6 +114,10 @@ int nf_conntrack_helpers_register(struct nf_conntrack_helper *, unsigned int,
 void nf_conntrack_helpers_unregister(struct nf_conntrack_helper **,
 				     unsigned int);
 
+#define nf_conntrack_helper_deprecated(name) \
+	pr_warn("The %s conntrack helper is scheduled for removal.\n"	\
+		"Please contact the netfilter-devel mailing list if you still need this.\n", name)
+
 struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp);
 
 int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
-- 
cgit v1.2.3


From 979c13114c0bb6ab9135e2c93e00c79c412aef09 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 22 Jun 2026 21:35:14 +0200
Subject: netfilter: nf_conntrack_expect: store master_tuple in expectation

Store master conntrack tuple in the expectation since exp->master might
refer to a different conntrack when accessed from rcu read side lock
area due to typesafe rcu rules.

Fixes: 02a3231b6d82 ("netfilter: nf_conntrack_expect: store netns and zone in expectation")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_expect.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index be4a120d549e..c024345c9bd8 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -26,6 +26,7 @@ struct nf_conntrack_expect {
 	possible_net_t net;
 
 	/* We expect this tuple, with the following mask */
+	struct nf_conntrack_tuple master_tuple;
 	struct nf_conntrack_tuple tuple;
 	struct nf_conntrack_tuple_mask mask;
 
-- 
cgit v1.2.3


From 1105ef941c1a28e115d1b97f17e1c85576884100 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Jun 2026 12:04:39 -0700
Subject: net: ethtool: keep rtnl_lock for ops using ethtool_op_get_link()

Breno reports following splats on mlx5:

  RTNL: assertion failed at net/core/dev.c (2241)
  WARNING: net/core/dev.c:2241 at netif_state_change+0xed/0x130, CPU#5: ethtool/1335
  RIP: 0010:netif_state_change+0xf9/0x130
  Call Trace:
    <TASK>
     __linkwatch_sync_dev+0xea/0x120
     ethtool_op_get_link+0xe/0x20
     __ethtool_get_link+0x26/0x40
     linkstate_prepare_data+0x51/0x200
     ethnl_default_doit+0x213/0x470
     genl_family_rcv_msg_doit+0xdd/0x110

Looks like I missed ethtool_op_get_link() trying to sync linkwatch,
which needs rtnl_lock. Not all drivers do this - bnxt doesn't,
it just returns the link state, so add an opt-in bit.

Reported-by: Breno Leitao <leitao@debian.org>
Fixes: 45079e00133e ("net: ethtool: optionally skip rtnl_lock on Netlink path for GET ops")
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Breno Leitao <leitao@debian.org>
Acked-by: Harshitha Ramamurthy <hramamurthy@google.com>
Link: https://patch.msgid.link/20260624190439.2521219-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/ethtool.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 1b834e2a522e..5d491a98265e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -942,6 +942,7 @@ struct kernel_ethtool_ts_info {
 #define ETHTOOL_OP_NEEDS_RTNL_GPAUSEPARAM	BIT(5)
 #define ETHTOOL_OP_NEEDS_RTNL_SPAUSEPARAM	BIT(6)
 #define ETHTOOL_OP_NEEDS_RTNL_RSS		BIT(7)
+#define ETHTOOL_OP_NEEDS_RTNL_GLINK		BIT(8)
 
 /**
  * struct ethtool_ops - optional netdev operations
@@ -978,6 +979,7 @@ struct kernel_ethtool_ts_info {
  *	 - phylink helpers (note that phydev is currently unsupported!)
  *	 - netdev_update_features()
  *	 - netif_set_real_num_tx_queues()
+ *	 - ethtool_op_get_link() (syncs link watch under rtnl_lock)
  *
  * @get_drvinfo: Report driver/device information. Modern drivers no
  *	longer have to implement this callback. Most fields are
-- 
cgit v1.2.3


From 12c765be84d28f22deca10e775889f54bd571a85 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Jun 2026 11:20:15 -0700
Subject: net: turn the rx_mode work into a generic netdev_work facility

The rx_mode update runs from a workqueue: drivers have their
ndo_set_rx_mode_async() callback executed by a single global
work item under RTNL and ops lock. This is a useful pattern.

Support multiple "events" that need to be serviced and make RX_MODE
sync the first one. Call the events "core" because later on
we will let drivers define and schedule their own.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20260624182018.2445732-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b67a12541eac..732506787db3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1930,8 +1930,9 @@ enum netdev_reg_state {
  *				has been enabled due to the need to listen to
  *				additional unicast addresses in a device that
  *				does not implement ndo_set_rx_mode()
- *	@rx_mode_node:		List entry for rx_mode work processing
- *	@rx_mode_tracker:	Refcount tracker for rx_mode work
+ *	@work_node:		List entry for async netdev_work processing
+ *	@work_tracker:		Refcount tracker for async netdev_work
+ *	@work_core_pending:	Core-defined pending netdev_work (NETDEV_WORK_*)
  *	@rx_mode_addr_cache:	Recycled snapshot entries for rx_mode work
  *	@rx_mode_retry_timer:	Timer that re-queues rx_mode work after failure
  *	@rx_mode_retry_count:	Number of consecutive retries already scheduled
@@ -2326,8 +2327,9 @@ struct net_device {
 	unsigned int		promiscuity;
 	unsigned int		allmulti;
 	bool			uc_promisc;
-	struct list_head	rx_mode_node;
-	netdevice_tracker	rx_mode_tracker;
+	struct list_head	work_node;
+	netdevice_tracker	work_tracker;
+	unsigned long		work_core_pending;
 	struct netdev_hw_addr_list	rx_mode_addr_cache;
 	struct timer_list	rx_mode_retry_timer;
 	unsigned int		rx_mode_retry_count;
-- 
cgit v1.2.3


From 129cdce9da9e44c52d38889e0411be9817bca114 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 24 Jun 2026 11:20:16 -0700
Subject: net: add the driver-facing netdev_work scheduling API

With an extra event mask we can easily extend the netdev work
to also service driver-defined events. For advanced drivers
this is probably not a perfect match, but it makes running
deferred work easier in simple cases.

Expose the netdev_work facility to drivers. Add helpers
to schedule work and a dedicated ndo to perform the driver-
-scheduled actions.

Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Link: https://patch.msgid.link/20260624182018.2445732-3-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 732506787db3..9981d637f8b5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1131,6 +1131,9 @@ struct netdev_net_notifier {
  *	netdev_hw_addr_list_for_each(ha, uc). Return 0 on success or a
  *	negative errno to request a retry via the core backoff.
  *
+ * void (*ndo_work)(struct net_device *dev, unsigned long events);
+ *	Run deferred work scheduled with netdev_work_sched(@events).
+ *
  * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
  *	This function  is called when the Media Access Control address
  *	needs to be changed. If this interface is not defined, the
@@ -1460,6 +1463,8 @@ struct net_device_ops {
 					struct net_device *dev,
 					struct netdev_hw_addr_list *uc,
 					struct netdev_hw_addr_list *mc);
+	void			(*ndo_work)(struct net_device *dev,
+					    unsigned long events);
 	int			(*ndo_set_mac_address)(struct net_device *dev,
 						       void *addr);
 	int			(*ndo_validate_addr)(struct net_device *dev);
@@ -1932,6 +1937,8 @@ enum netdev_reg_state {
  *				does not implement ndo_set_rx_mode()
  *	@work_node:		List entry for async netdev_work processing
  *	@work_tracker:		Refcount tracker for async netdev_work
+ *	@work_pending:		Driver-defined pending netdev_work, passed to
+ *				ndo_work() (see netdev_work_sched())
  *	@work_core_pending:	Core-defined pending netdev_work (NETDEV_WORK_*)
  *	@rx_mode_addr_cache:	Recycled snapshot entries for rx_mode work
  *	@rx_mode_retry_timer:	Timer that re-queues rx_mode work after failure
@@ -2329,6 +2336,7 @@ struct net_device {
 	bool			uc_promisc;
 	struct list_head	work_node;
 	netdevice_tracker	work_tracker;
+	unsigned long		work_pending;
 	unsigned long		work_core_pending;
 	struct netdev_hw_addr_list	rx_mode_addr_cache;
 	struct timer_list	rx_mode_retry_timer;
@@ -5178,6 +5186,9 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
 			   const struct pcpu_sw_netstats __percpu *netstats);
 void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);
 
+void netdev_work_sched(struct net_device *dev, unsigned long events);
+unsigned long netdev_work_cancel(struct net_device *dev, unsigned long mask);
+
 enum {
 	NESTED_SYNC_IMM_BIT,
 	NESTED_SYNC_TODO_BIT,
-- 
cgit v1.2.3