summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2026-07-01 18:56:27 +0200
committerPaolo Abeni <pabeni@redhat.com>2026-07-01 18:56:27 +0200
commit0469d460a598d03fc85ebd97f99640e6c579e2a2 (patch)
tree60db252156e4f70615071fab658b959095a5f9dc
parenta225f8c20712713406ae47024b8df42deacddd4a (diff)
parentc3716a3c43465641ded6e01c0b187de42e87a80d (diff)
Merge tag 'nf-26-06-30' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf
Florian Westphal says: ==================== netfilter: updates for net The following patchset contains Netfilter fixes for *net*. Due to bug volume, the plan is to make a second *net* pull request this Friday. 1) Zero nf_conntrack_expect at allocation to prevent uninitialized data leaks to userspace. Add missing exp->dir initialization. 2) Prevent out-of-bounds writes in nft_set_pipapo caused by inconsistent clones during allocation failures. Fail operations if the clone enters an error state. This was a day-0 bug. 3) Fix use-after-free race between ipset dump and array resizing. Protect array pointer access with rcu_read_lock(). From Xiang Mei. Bug existed since v4.20. 4) Validate that skb_dst() exists before accessing it in nf_conntrack_sip. This prevents a crash when called from tc ingress or openvswitch. From Pablo Neira Ayuso. Bug added in 4.3 when ovs gained support for conntrack helpers. 5) Cap the maximum number of expectations to NF_CT_EXPECT_MAX_CNT during userspace helper policy updates. Also from Pablo. 6) Prevent NULL pointer dereference in nft_fib on netdev egress hooks. Add nft_fib_netdev_validate() to restrict fib expressions to appropriate netdev hooks. Restrict nft_fib_validate() to IPv4, IPv6, and INET protocols. From Theodor Arsenij Larionov-Trichkine. Bug was exposed in v5.16 when egress hooks got added. 7) Restrict nfnetlink_queue writes to network headers. Validate IP/IPv6 header length and disable extension headers or IP option modifications. Disable bridge modification for now; it's unlikely anyone is using this. 8) Restrict arbitrary writes to link-layer and network headers in nftables. Prevent link-layer modifications from spilling into network headers. Prevent writes to IP version and length fields. 9) Restrict L3 checksum update offset to IPv4. Otherwise the csum offset can be used to munge arbitrary header offsets, rendering the previous change moot. 
These three patches are follow-ups to a 7.1 change that disabled header rewrite ability in unprivileged network namespaces. Unprivileged netns support is not yet enabled again here. netfilter pull request nf-26-06-30 * tag 'nf-26-06-30' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: netfilter: nftables: restrict checkum update offset netfilter: nftables: restrict linklayer and network header writes netfilter: nfnetlink_queue: restrict writes to network header netfilter: nft_fib: reject fib expression on the netdev egress hook netfilter: nfnetlink_cthelper: cap to maximum number of expectation per master netfilter: nf_conntrack_sip: validate skb_dst() before accessing it netfilter: ipset: fix race between dump and ip_set_list resize netfilter: nft_set_pipapo: don't leak bad clone into future transaction netfilter: nf_conntrack_expect: zero at allocation time ==================== Link: https://patch.msgid.link/20260630045243.2657-1-fw@strlen.de Signed-off-by: Paolo Abeni <pabeni@redhat.com>
-rw-r--r--net/netfilter/ipset/ip_set_core.c8
-rw-r--r--net/netfilter/nf_conntrack_expect.c3
-rw-r--r--net/netfilter/nf_conntrack_netlink.c11
-rw-r--r--net/netfilter/nf_conntrack_sip.c7
-rw-r--r--net/netfilter/nfnetlink_cthelper.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c170
-rw-r--r--net/netfilter/nft_fib.c9
-rw-r--r--net/netfilter/nft_fib_netdev.c29
-rw-r--r--net/netfilter/nft_payload.c270
-rw-r--r--net/netfilter/nft_set_pipapo.c34
-rw-r--r--net/netfilter/nft_set_pipapo.h8
11 files changed, 531 insertions, 20 deletions
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index a531b654b8d9..6cfad152d7d1 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1480,7 +1480,11 @@ ip_set_dump_done(struct netlink_callback *cb)
struct ip_set_net *inst =
(struct ip_set_net *)cb->args[IPSET_CB_NET];
ip_set_id_t index = (ip_set_id_t)cb->args[IPSET_CB_INDEX];
- struct ip_set *set = ip_set_ref_netlink(inst, index);
+ struct ip_set *set;
+
+ rcu_read_lock();
+ set = ip_set_ref_netlink(inst, index);
+ rcu_read_unlock();
if (set->variant->uref)
set->variant->uref(set, cb, false);
@@ -1686,7 +1690,9 @@ next_set:
release_refcount:
/* If there was an error or set is done, release set */
if (ret || !cb->args[IPSET_CB_ARG0]) {
+ rcu_read_lock();
set = ip_set_ref_netlink(inst, index);
+ rcu_read_unlock();
if (set->variant->uref)
set->variant->uref(set, cb, false);
pr_debug("release set %s\n", set->name);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 38630c5e006f..7ae68d60586a 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -306,7 +306,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
{
struct nf_conntrack_expect *new;
- new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
+ new = kmem_cache_zalloc(nf_ct_expect_cachep, GFP_ATOMIC);
if (!new)
return NULL;
@@ -391,6 +391,7 @@ void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
#if IS_ENABLED(CONFIG_NF_NAT)
memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
+ exp->dir = 0;
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4217715d42dc..31cbb1b55b9e 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3549,8 +3549,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
if (cda[CTA_EXPECT_FLAGS]) {
exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS]));
exp->flags &= ~NF_CT_EXPECT_USERSPACE;
- } else {
- exp->flags = 0;
}
if (cda[CTA_EXPECT_FN]) {
const char *name = nla_data(cda[CTA_EXPECT_FN]);
@@ -3562,8 +3560,7 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
goto err_out;
}
exp->expectfn = expfn->expectfn;
- } else
- exp->expectfn = NULL;
+ }
exp->class = class;
exp->master = ct;
@@ -3583,12 +3580,6 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct,
exp, nf_ct_l3num(ct));
if (err < 0)
goto err_out;
-#if IS_ENABLED(CONFIG_NF_NAT)
- } else {
- memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
- memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
- exp->dir = 0;
-#endif
}
return exp;
err_out:
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 5ec3a4a4bbd7..f3f90a866338 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -956,7 +956,6 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
return NF_ACCEPT;
saddr = &ct->tuplehash[!dir].tuple.src.u3;
} else if (sip_external_media) {
- struct net_device *dev = skb_dst(skb)->dev;
struct dst_entry *dst = NULL;
struct flowi fl;
@@ -978,7 +977,11 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, unsigned int protoff,
* through the same interface as the signalling peer.
*/
if (dst) {
- bool external_media = (dst->dev == dev);
+ const struct dst_entry *this_dst = skb_dst(skb);
+ bool external_media = false;
+
+ if (this_dst && dst->dev == this_dst->dev)
+ external_media = true;
dst_release(dst);
if (external_media)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index f1460b683d7a..2cbcca9110db 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -163,6 +163,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy,
tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN);
expect_policy->max_expected =
ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX]));
+ if (!expect_policy->max_expected)
+ expect_policy->max_expected = NF_CT_EXPECT_MAX_CNT;
if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT)
return -EINVAL;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 80ca077b81bd..35d4c6c628ff 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1184,6 +1184,173 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
return err;
}
+static bool nfqnl_validate_ipopts(const struct iphdr *iph_new,
+ const struct nf_queue_entry *e)
+{
+ const struct iphdr *iph_orig = ip_hdr(e->skb);
+ unsigned int ihl = iph_new->ihl * 4;
+
+ if (iph_new->ihl != iph_orig->ihl)
+ return false;
+ if (ihl == sizeof(*iph_orig))
+ return true;
+
+ return memcmp(iph_new + 1, ip_hdr(e->skb) + 1, ihl - sizeof(*iph_orig)) == 0;
+}
+
+static bool nfqnl_validate_ip4(const struct iphdr *iph, unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ unsigned int ihl;
+
+ if (data_len < sizeof(*iph))
+ return false;
+
+ ihl = iph->ihl * 4u;
+ if (ihl < sizeof(*iph) || data_len < ihl)
+ return false;
+
+ if (iph->version != 4 ||
+ ((iph->frag_off ^ ip_hdr(e->skb)->frag_off) & ~htons(IP_DF)) != 0)
+ return false;
+
+ /* BIG TCP won't work; netlink attr len is u16 */
+ if (ntohs(iph->tot_len) != data_len)
+ return false;
+
+ /* support for ipopts mangling would require
+ * recompile + skb transport header update.
+ */
+ return nfqnl_validate_ipopts(iph, e);
+}
+
+static bool nfqnl_validate_one_exthdr(const u8 *data,
+ unsigned int data_len,
+ const struct nf_queue_entry *e,
+ int start, int hdrlen)
+{
+ u16 octets;
+
+ if (data_len < hdrlen || hdrlen < 2)
+ return false;
+
+ while (hdrlen > 0) {
+ if (data_len < sizeof(octets))
+ return false;
+ data_len -= sizeof(octets);
+
+ if (skb_copy_bits(e->skb, start, &octets, sizeof(octets)))
+ return false;
+
+ if (hdrlen < sizeof(octets))
+ return false;
+
+ hdrlen -= sizeof(octets);
+ if (memcmp(data, &octets, sizeof(octets)))
+ return false;
+
+ start += sizeof(octets);
+ data += sizeof(octets);
+ }
+
+ return true;
+}
+
+static bool nfqnl_validate_exthdr(const struct ipv6hdr *ip6_new,
+ unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ const struct ipv6hdr *ip6_orig = ipv6_hdr(e->skb);
+ int exthdr_cnt = 0, start = sizeof(*ip6_orig);
+ const u8 *data = (const u8 *)ip6_new;
+ u8 orig_nexthdr = ip6_orig->nexthdr;
+ u8 new_nexthdr = ip6_new->nexthdr;
+
+ if (new_nexthdr != orig_nexthdr)
+ return false;
+
+ data += sizeof(*ip6_new);
+ data_len -= sizeof(*ip6_new);
+
+ while (ipv6_ext_hdr(orig_nexthdr)) {
+ const struct ipv6_opt_hdr *hp;
+ struct ipv6_opt_hdr _hdr;
+ int hdrlen;
+
+ if (orig_nexthdr == NEXTHDR_NONE)
+ return true;
+
+ if (unlikely(exthdr_cnt++ >= IP6_MAX_EXT_HDRS_CNT))
+ return false;
+
+ hp = skb_header_pointer(e->skb, start, sizeof(_hdr), &_hdr);
+ if (!hp)
+ return false;
+
+ switch (orig_nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ hdrlen = sizeof(struct frag_hdr);
+ break;
+ case NEXTHDR_AUTH:
+ hdrlen = ipv6_authlen(hp);
+ break;
+ default:
+ hdrlen = ipv6_optlen(hp);
+ break;
+ }
+
+ if (!nfqnl_validate_one_exthdr(data, data_len, e,
+ start, hdrlen))
+ return false;
+
+ orig_nexthdr = hp->nexthdr;
+ hp = (const void *)data;
+ new_nexthdr = hp->nexthdr;
+
+ if (new_nexthdr != orig_nexthdr)
+ return false;
+
+ data_len -= hdrlen;
+ start += hdrlen;
+ data += hdrlen;
+ }
+
+ return true;
+}
+
+static bool nfqnl_validate_ip6(const struct ipv6hdr *ip6, unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ if (data_len < sizeof(*ip6))
+ return false;
+
+ /* BIG TCP/jumbograms won't work; netlink attr len is u16 */
+ if (ntohs(ip6->payload_len) != data_len - sizeof(*ip6))
+ return false;
+
+ if (ip6->version != 6)
+ return false;
+
+ return nfqnl_validate_exthdr(ip6, data_len, e);
+}
+
+static bool nfqnl_validate_write(const void *data, unsigned int data_len,
+ const struct nf_queue_entry *e)
+{
+ switch (e->state.pf) {
+ case NFPROTO_IPV4:
+ return nfqnl_validate_ip4(data, data_len, e);
+ case NFPROTO_IPV6:
+ return nfqnl_validate_ip6(data, data_len, e) &&
+ !(IP6CB(e->skb)->flags & IP6SKB_JUMBOGRAM);
+ case NFPROTO_BRIDGE:
+ /* No write support. Bridge is dubious: userspace doesn't even see L2 header */
+ return false;
+ }
+
+ return false;
+}
+
static int
nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
@@ -1192,6 +1359,9 @@ nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int di
if (e->state.net->user_ns != &init_user_ns)
return -EPERM;
+ if (!nfqnl_validate_write(data, data_len, e))
+ return -EINVAL;
+
if (diff < 0) {
unsigned int min_len = skb_transport_offset(e->skb);
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index e048f05694cd..89555380f1c5 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -31,6 +31,15 @@ int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr)
const struct nft_fib *priv = nft_expr_priv(expr);
unsigned int hooks;
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
switch (priv->result) {
case NFT_FIB_RESULT_OIF:
case NFT_FIB_RESULT_OIFNAME:
diff --git a/net/netfilter/nft_fib_netdev.c b/net/netfilter/nft_fib_netdev.c
index 3f3478abd845..5774a7544027 100644
--- a/net/netfilter/nft_fib_netdev.c
+++ b/net/netfilter/nft_fib_netdev.c
@@ -50,6 +50,33 @@ static void nft_fib_netdev_eval(const struct nft_expr *expr,
regs->verdict.code = NFT_BREAK;
}
+static int nft_fib_netdev_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_fib *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->result) {
+ case NFT_FIB_RESULT_OIF:
+ case NFT_FIB_RESULT_OIFNAME:
+ hooks = (1 << NF_NETDEV_INGRESS);
+ break;
+ case NFT_FIB_RESULT_ADDRTYPE:
+ if (priv->flags & NFTA_FIB_F_IIF)
+ hooks = (1 << NF_NETDEV_INGRESS);
+ else if (priv->flags & NFTA_FIB_F_OIF)
+ hooks = (1 << NF_NETDEV_EGRESS);
+ else
+ hooks = (1 << NF_NETDEV_INGRESS) |
+ (1 << NF_NETDEV_EGRESS);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
static struct nft_expr_type nft_fib_netdev_type;
static const struct nft_expr_ops nft_fib_netdev_ops = {
.type = &nft_fib_netdev_type,
@@ -57,7 +84,7 @@ static const struct nft_expr_ops nft_fib_netdev_ops = {
.eval = nft_fib_netdev_eval,
.init = nft_fib_init,
.dump = nft_fib_dump,
- .validate = nft_fib_validate,
+ .validate = nft_fib_netdev_validate,
};
static struct nft_expr_type nft_fib_netdev_type __read_mostly = {
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 345eff140d56..391539a1ceaa 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -834,6 +834,249 @@ nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
return true;
}
+/* Ingress is very early, before l3 protocol handlers.
+ * There should be no in-tree code that trusts l3/l4 headers
+ * between ingress and NF_INET_PRE_ROUTING hooks.
+ */
+static bool nft_in_ingress(const struct nf_hook_state *s)
+{
+ return s->pf == NFPROTO_NETDEV && s->hook == NF_NETDEV_INGRESS;
+}
+
+static bool nft_nh_write_ok_ip4(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+ unsigned int offset = priv->offset + skb_network_offset(pkt->skb);
+ const u8 *new_octets = (const u8 *)src;
+ u8 old_octet;
+
+ switch (priv->offset) {
+ case 0: /* csum fixups does expand dscp/tos store to 2 bytes.
+ * make sure ihl/version remain unchanged.
+ */
+ if (skb_copy_bits(pkt->skb, offset, &old_octet, sizeof(old_octet)))
+ return false;
+
+ return priv->len == 2 &&
+ *new_octets == old_octet;
+ case offsetof(struct iphdr, tos):
+ return priv->len == 1;
+ case offsetof(struct iphdr, id):
+ return priv->len == 2;
+ case offsetof(struct iphdr, ttl):
+ if (priv->len == 1)
+ return true;
+
+ if (priv->len != 2)
+ return false;
+
+ /* same, csum fixup does expand ttl store to two bytes.
+ * check protocol is not altered.
+ */
+ if (skb_copy_bits(pkt->skb, offset + 1, &old_octet, sizeof(old_octet)))
+ return false;
+
+ return new_octets[1] == old_octet;
+ case offsetof(struct iphdr, check):
+ return priv->len <= 2 + 4 + 4;
+ case offsetof(struct iphdr, saddr):
+ return priv->len <= 4 + 4;
+ case offsetof(struct iphdr, daddr):
+ return priv->len <= 4;
+ }
+
+ return false;
+}
+
+static bool nft_nh_write_ok_ip6(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+ const struct ipv6hdr *ih = (const void *)src;
+
+ switch (priv->offset) {
+ case 0: /* store to dscp must not alter ip6 version */
+ return priv->len <= 4 && ih->version == 6;
+ case 2:
+ return priv->len <= 2;
+ case offsetof(struct ipv6hdr, hop_limit):
+ return priv->len == 1;
+ case offsetof(struct ipv6hdr, saddr):
+ return priv->len <= 16 + 16;
+ case offsetof(struct ipv6hdr, daddr):
+ return priv->len <= 16;
+ }
+
+ return false;
+}
+
+static bool nft_nh_write_ok_arp(const struct nft_payload_set *priv)
+{
+ /* Variable size for standard ethernet arp */
+ const unsigned int eth_ip = 2 * (ETH_ALEN + 4);
+ unsigned int offset = priv->offset;
+
+ switch (offset) {
+ case offsetof(struct arphdr, ar_op):
+ return priv->len == 2;
+ default:
+ break;
+ }
+
+ /* permit writes post fixed arp header size. offset + len are
+ * checked vs skb size via skb_ensure_writable.
+ */
+ return offset >= sizeof(struct arphdr) && priv->len <= eth_ip;
+}
+
+static bool nft_nh_write_ok_netdev(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+#ifdef CONFIG_NF_TABLES_NETDEV
+ switch (pkt->skb->protocol) {
+ case htons(ETH_P_ARP):
+ return nft_nh_write_ok_arp(priv);
+ case htons(ETH_P_IP):
+ return nft_nh_write_ok_ip4(pkt, priv, src);
+ case htons(ETH_P_IPV6):
+ return nft_nh_write_ok_ip6(pkt, priv, src);
+ }
+#endif
+ /* default to false for now, relax later in case we have
+ * use-cases that need inner header manipulation for
+ * encapsulated traffic like vlan or PPPoE.
+ */
+ return false;
+}
+
+static bool nft_nh_write_ok_bridge(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
+ switch (pkt->ethertype) {
+ case htons(ETH_P_ARP):
+ return nft_nh_write_ok_arp(priv);
+ case htons(ETH_P_IP):
+ return nft_nh_write_ok_ip4(pkt, priv, src);
+ case htons(ETH_P_IPV6):
+ return nft_nh_write_ok_ip6(pkt, priv, src);
+ }
+#endif
+ /* see nft_nh_write_ok_netdev: default to false */
+ return false;
+}
+
+static bool nft_nh_write_ok(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv,
+ const u32 *src)
+{
+ switch (pkt->state->pf) {
+ case NFPROTO_ARP:
+ return nft_nh_write_ok_arp(priv);
+ case NFPROTO_BRIDGE:
+ return nft_nh_write_ok_bridge(pkt, priv, src);
+ case NFPROTO_IPV4:
+ return nft_nh_write_ok_ip4(pkt, priv, src);
+ case NFPROTO_IPV6:
+ return nft_nh_write_ok_ip6(pkt, priv, src);
+ case NFPROTO_NETDEV:
+ if (pkt->state->hook == NF_NETDEV_INGRESS)
+ return true;
+ return nft_nh_write_ok_netdev(pkt, priv, src);
+ }
+
+ return false;
+}
+
+/* check linklayer modifications don't spill into network header. */
+static bool nft_ll_write_ok(const struct nft_pktinfo *pkt, int offset)
+{
+ if (nft_in_ingress(pkt->state))
+ return true;
+
+ return offset <= skb_network_offset(pkt->skb);
+}
+
+static bool nft_payload_validate_inet_csum_offset(const struct nft_ctx *ctx,
+ const struct nft_payload_set *priv)
+{
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ if (ctx->family == NFPROTO_IPV4) {
+ if (offsetof(struct iphdr, check) == priv->csum_offset)
+ return true;
+
+ return false;
+ }
+ return true; /* run time validation required */
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ if (priv->csum_flags) /* makes no sense, asks for "re-update" of L4 checksum */
+ return false;
+
+ /* no further check here; offset can't be negative so bogus
+ * offsets can corrupt L4 or payload but not l3 headers.
+ * We already allow arbitrary l4/inner payload writes.
+ */
+ return true;
+ case NFT_PAYLOAD_INNER_HEADER:
+ return true;
+ case NFT_PAYLOAD_TUN_HEADER:
+ break;
+ }
+
+ return false;
+}
+
+/* do not allow arbitrary network header mangling via bogus csum_off.
+ * We only support ipv4. Only NFPROTO_IPV4 can be checked from control
+ * plane.
+ */
+static bool nft_payload_csum_nh_write_ok(const struct nft_payload_set *priv,
+ const struct nft_pktinfo *pkt)
+{
+ switch (pkt->state->pf) {
+ case NFPROTO_IPV4:
+ /* Warning: NFPROTO_INET was not checked; we can't return true here. */
+ return priv->csum_offset == offsetof(struct iphdr, check);
+ case NFPROTO_IPV6:
+ return false;
+ case NFPROTO_BRIDGE:
+ return pkt->ethertype == htons(ETH_P_IP) &&
+ priv->csum_offset == offsetof(struct iphdr, check);
+ case NFPROTO_NETDEV:
+ return pkt->skb->protocol == htons(ETH_P_IP) &&
+ priv->csum_offset == offsetof(struct iphdr, check);
+ }
+
+ return false;
+}
+
+static bool nft_payload_csum_write_ok(const struct nft_pktinfo *pkt,
+ const struct nft_payload_set *priv)
+{
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ return nft_payload_csum_nh_write_ok(priv, pkt);
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ case NFT_PAYLOAD_INNER_HEADER:
+ /* neither offsets are validated, offsets cannot be
+ * negative so real l3 headers cannot be mangled.
+ */
+ return true;
+ case NFT_PAYLOAD_TUN_HEADER:
+ break;
+ }
+
+ return false;
+}
+
static void nft_payload_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -861,8 +1104,12 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
}
offset = skb_mac_header(skb) - skb->data - vlan_hlen;
+ if (!nft_ll_write_ok(pkt, priv->len + priv->offset + offset))
+ goto err;
break;
case NFT_PAYLOAD_NETWORK_HEADER:
+ if (!nft_nh_write_ok(pkt, priv, src))
+ goto err;
offset = skb_network_offset(skb);
break;
case NFT_PAYLOAD_TRANSPORT_HEADER:
@@ -894,6 +1141,7 @@ static void nft_payload_set_eval(const struct nft_expr *expr,
tsum = csum_partial(src, priv->len, 0);
if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ nft_payload_csum_write_ok(pkt, priv) &&
nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset))
goto err;
@@ -960,7 +1208,26 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
switch (csum_type) {
case NFT_PAYLOAD_CSUM_NONE:
+ if (priv->csum_offset) /* nonsensical */
+ return -EINVAL;
+
+ if (priv->csum_flags == 0)
+ break;
+
+ /* Userspace requests L4 checksum update, e.g.:
+ * - IPv6 stateless NAT (no l3 csum)
+ * - transport header mangling
+ * - inner data mangling
+ */
+ if (priv->base == NFT_PAYLOAD_NETWORK_HEADER ||
+ priv->base == NFT_PAYLOAD_TRANSPORT_HEADER ||
+ priv->base == NFT_PAYLOAD_INNER_HEADER)
+ break;
+
+ return -EINVAL;
case NFT_PAYLOAD_CSUM_INET:
+ if (!nft_payload_validate_inet_csum_offset(ctx, priv))
+ return -EINVAL;
break;
case NFT_PAYLOAD_CSUM_SCTP:
if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER)
@@ -968,6 +1235,9 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
if (priv->csum_offset != offsetof(struct sctphdr, checksum))
return -EINVAL;
+
+ if (priv->csum_flags)
+ return -EINVAL;
break;
default:
return -EOPNOTSUPP;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 706c78853f24..978bb0c01106 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -342,6 +342,8 @@
#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
+static void nft_pipapo_abort(const struct nft_set *set);
+
/**
* pipapo_refill() - For each set bit, set bits from selected mapping table item
* @map: Bitmap to be scanned for set bits
@@ -1296,7 +1298,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
const u8 *start_p, *end_p;
int i, bsize_max, err = 0;
- if (!m)
+ if (!m || m->state == NFT_PIPAPO_CLONE_ERR)
return -ENOMEM;
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
@@ -1367,8 +1369,10 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
else
ret = pipapo_expand(f, start, end, f->groups * f->bb);
- if (ret < 0)
- return ret;
+ if (ret < 0) {
+ err = ret;
+ goto abort;
+ }
if (f->bsize > bsize_max)
bsize_max = f->bsize;
@@ -1384,7 +1388,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
err = pipapo_realloc_scratch(m, bsize_max);
if (err)
- return err;
+ goto abort;
m->bsize_max = bsize_max;
} else {
@@ -1396,7 +1400,26 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
pipapo_map(m, rulemap, e);
+ m->state = NFT_PIPAPO_CLONE_MOD;
return 0;
+abort:
+ DEBUG_NET_WARN_ON_ONCE(m->state == NFT_PIPAPO_CLONE_ERR);
+
+ /* Two rollback cases:
+ * 1) no previous changes. nft_pipapo_abort is not
+ * guaranteed to be invoked (there might be no further
+ * add/delete requests coming after this).
+ *
+ * 2) we had previous changes: there are transaction
+ * records pointing to this set. Leave the rollback to
+ * the transaction handling.
+ */
+ if (m->state == NFT_PIPAPO_CLONE_NEW)
+ nft_pipapo_abort(set); /* releases m */
+ else
+ m->state = NFT_PIPAPO_CLONE_ERR;
+
+ return err;
}
/**
@@ -1473,6 +1496,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
dst++;
}
+ new->state = NFT_PIPAPO_CLONE_NEW;
return new;
out_mt:
@@ -1896,7 +1920,7 @@ nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
/* removal must occur on priv->clone, if we are low on memory
* we have no choice and must fail the removal request.
*/
- if (!m)
+ if (!m || m->state == NFT_PIPAPO_CLONE_ERR)
return NULL;
e = pipapo_get(m, (const u8 *)elem->key.val.data,
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index b82abb03576e..a19e980d06ef 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -131,9 +131,16 @@ struct nft_pipapo_scratch {
unsigned long __map[];
};
+enum nft_pipapo_clone_state {
+ NFT_PIPAPO_CLONE_NEW,
+ NFT_PIPAPO_CLONE_MOD,
+ NFT_PIPAPO_CLONE_ERR,
+};
+
/**
* struct nft_pipapo_match - Data used for lookup and matching
* @field_count: Amount of fields in set
+ * @state: add/delete state; used from control plane
* @bsize_max: Maximum lookup table bucket size of all fields, in longs
* @scratch: Preallocated per-CPU maps for partial matching results
* @rcu: Matching data is swapped on commits
@@ -141,6 +148,7 @@ struct nft_pipapo_scratch {
*/
struct nft_pipapo_match {
u8 field_count;
+ enum nft_pipapo_clone_state state:8;
unsigned int bsize_max;
struct nft_pipapo_scratch * __percpu *scratch;
struct rcu_head rcu;