diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2026-06-23 19:07:42 -0700 |
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2026-06-23 19:07:42 -0700 |
| commit | 96c035083e2b6dc5e535cf9de2f450da7b82e3ee (patch) | |
| tree | 341ff1fd438a0464300f9d89121a5702d48ac9c4 | |
| parent | e9deb406c10f5a73bcfd62f42ca1187b220bc188 (diff) | |
| parent | 33a971d549d82b06c07ce6ed10c33089f80fa944 (diff) | |
Merge branch 'drop-skb-metadata-before-lwt-encapsulation'
Jakub Sitnicki says:
====================
Drop skb metadata before LWT encapsulation
See description for patch 1.
====================
Link: https://patch.msgid.link/20260619-bpf-lwt-drop-skb-metadata-v3-0-71d6a33ab76b@cloudflare.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
| -rw-r--r-- | net/core/lwtunnel.c | 6 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/config | 3 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c | 175 | ||||
| -rw-r--r-- | tools/testing/selftests/bpf/progs/test_xdp_meta.c | 123 |
4 files changed, 255 insertions, 52 deletions
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index f9d76d85d04f..b01a395d9a96 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -350,6 +350,8 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->output)) { + /* Encap pushes outer headers over the metadata; drop it. */ + skb_metadata_clear(skb); dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); dev_xmit_recursion_dec(); @@ -404,6 +406,8 @@ int lwtunnel_xmit(struct sk_buff *skb) rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->xmit)) { + /* Encap pushes outer headers over the metadata; drop it. */ + skb_metadata_clear(skb); dev_xmit_recursion_inc(); ret = ops->xmit(skb); dev_xmit_recursion_dec(); @@ -455,6 +459,8 @@ int lwtunnel_input(struct sk_buff *skb) rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->input)) { + /* Encap pushes outer headers over the metadata; drop it. */ + skb_metadata_clear(skb); dev_xmit_recursion_inc(); ret = ops->input(skb); dev_xmit_recursion_dec(); diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 24855381290d..aebc5082fd77 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -45,13 +45,16 @@ CONFIG_IPV6=y CONFIG_IPV6_FOU=y CONFIG_IPV6_FOU_TUNNEL=y CONFIG_IPV6_GRE=y +CONFIG_IPV6_IOAM6_LWTUNNEL=y CONFIG_IPV6_SEG6_BPF=y +CONFIG_IPV6_SEG6_LWTUNNEL=y CONFIG_IPV6_SIT=y CONFIG_IPV6_TUNNEL=y CONFIG_KEYS=y CONFIG_LIRC=y CONFIG_LIVEPATCH=y CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y CONFIG_MODULE_SIG=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_UNLOAD=y diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c index 26159e0499c7..448807676176 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_context_test_run.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include <test_progs.h> #include <network_helpers.h> +#include <linux/ipv6.h> +#include <arpa/inet.h> #include "test_xdp_context_test_run.skel.h" #include "test_xdp_meta.skel.h" @@ -8,9 +10,12 @@ #define TX_NAME "veth1" #define TX_NETNS "xdp_context_tx" #define RX_NETNS "xdp_context_rx" +#define RX_MAC "02:00:00:00:00:01" +#define TX_MAC "02:00:00:00:00:02" #define TAP_NAME "tap0" #define DUMMY_NAME "dum0" #define TAP_NETNS "xdp_context_tuntap" +#define LWT_NETNS "xdp_context_lwt" #define TEST_PAYLOAD_LEN 32 static const __u8 test_payload[TEST_PAYLOAD_LEN] = { @@ -187,6 +192,42 @@ static int write_test_packet(int tap_fd) return 0; } +/* Inject Ethernet+IPv6+UDP frame into TAP */ +static int write_test_packet_udp(int tap_fd) +{ + __u8 pkt[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + TEST_PAYLOAD_LEN] = {}; + struct ethhdr *eth = (void *)pkt; + struct ipv6hdr *ip6 = (void *)(eth + 1); + struct udphdr *udp = (void *)(ip6 + 1); + __u8 *payload = (void *)(udp + 1); + const __u8 tap_mac[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0x01 }; + int n; + + memcpy(eth->h_dest, tap_mac, ETH_ALEN); + eth->h_proto = htons(ETH_P_IPV6); + + ip6->version = 6; + ip6->hop_limit = 64; + ip6->nexthdr = IPPROTO_UDP; + ip6->payload_len = htons(sizeof(*udp) + TEST_PAYLOAD_LEN); + inet_pton(AF_INET6, "fd00::2", &ip6->saddr); + inet_pton(AF_INET6, "fd00:1::1", &ip6->daddr); + + udp->source = htons(42); + udp->dest = htons(42); + udp->len = htons(sizeof(*udp) + TEST_PAYLOAD_LEN); + /* UDP checksum is not validated on the forwarding path. */ + + memcpy(payload, test_payload, TEST_PAYLOAD_LEN); + + n = write(tap_fd, pkt, sizeof(pkt)); + if (!ASSERT_EQ(n, sizeof(pkt), "write frame")) + return -1; + + return 0; +} + static void dump_err_stream(const struct bpf_program *prog) { char buf[512]; @@ -518,3 +559,137 @@ void test_xdp_context_tuntap(void) test_xdp_meta__destroy(skel); } + +/* + * Test topology: + * + * tap0 fd00::1 + * RX: injected IPv6 UDP frame, XDP ingress sets metadata + * fwd: encap route prepends outer header(s) + * TX: TC egress validates metadata + * + * A routable IPv6 UDP frame is written into the tap fd, so it enters the RX + * path where XDP stores metadata. Routing then forwards it back out the same + * tap through an encapsulating route that prepends outer header(s). The TC + * egress program checks that the pushed header did not silently corrupt + * metadata. + */ +#define LWT_PIN_PATH "/sys/fs/bpf/xdp_context_lwt_xmit" + +enum lwt_encap_type { + LWT_ENCAP_BPF, + LWT_ENCAP_MPLS, + LWT_ENCAP_SEG6, + LWT_ENCAP_IOAM6, +}; + +static void test_lwt_encap(struct test_xdp_meta *skel, + enum lwt_encap_type type) +{ + LIBBPF_OPTS(bpf_tc_hook, tc_hook, .attach_point = BPF_TC_EGRESS); + LIBBPF_OPTS(bpf_tc_opts, tc_opts, .handle = 1, .priority = 1); + struct bpf_program *lwt_prog = NULL; + struct netns_obj *ns = NULL; + const char *encap; + bool pinned = false; + int tap_ifindex; + int tap_fd = -1; + int ret; + + skel->bss->test_pass = false; + + switch (type) { + case LWT_ENCAP_BPF: + encap = "encap bpf xmit pinned " LWT_PIN_PATH " via fd00::2"; + lwt_prog = skel->progs.dummy_lwt_xmit; + break; + case LWT_ENCAP_MPLS: + encap = "encap mpls 100 via inet6 fd00::2"; + break; + case LWT_ENCAP_SEG6: + encap = "encap seg6 mode encap segs fd00::2"; + break; + case LWT_ENCAP_IOAM6: + encap = "encap ioam6 mode encap tundst fd00::2 " + "trace prealloc type 0x800000 ns 0 size 4 via fd00::2"; + break; + default: + return; + } + + if (lwt_prog) { + unlink(LWT_PIN_PATH); + ret = bpf_program__pin(lwt_prog, LWT_PIN_PATH); + if (!ASSERT_OK(ret, "pin lwt prog")) + return; + pinned = true; + } + + ns = netns_new(LWT_NETNS, true); + if (!ASSERT_OK_PTR(ns, "netns_new")) + goto close; + + tap_fd = open_tuntap(TAP_NAME, true); + if (!ASSERT_GE(tap_fd, 0, "open_tuntap")) + goto close; + + SYS(close, "ip link set dev " TAP_NAME " address " RX_MAC); + SYS(close, "sysctl -wq net.ipv6.conf.all.forwarding=1"); + SYS(close, "ip addr add fd00::1/64 dev " TAP_NAME " nodad"); + SYS(close, "ip link set dev " TAP_NAME " up"); + SYS(close, "ip neigh add fd00::2 lladdr " TX_MAC " nud permanent dev " TAP_NAME); + SYS(close, "ip -6 route add fd00:1::/64 %s dev %s", encap, TAP_NAME); + + tap_ifindex = if_nametoindex(TAP_NAME); + if (!ASSERT_GE(tap_ifindex, 0, "if_nametoindex")) + goto close; + + ret = bpf_xdp_attach(tap_ifindex, bpf_program__fd(skel->progs.ing_xdp), + 0, NULL); + if (!ASSERT_GE(ret, 0, "bpf_xdp_attach")) + goto close; + + tc_hook.ifindex = tap_ifindex; + ret = bpf_tc_hook_create(&tc_hook); + if (!ASSERT_OK(ret, "bpf_tc_hook_create")) + goto close; + + tc_opts.prog_fd = bpf_program__fd(skel->progs.tc_is_meta_empty); + ret = bpf_tc_attach(&tc_hook, &tc_opts); + if (!ASSERT_OK(ret, "bpf_tc_attach")) + goto close; + + ret = write_test_packet_udp(tap_fd); + if (!ASSERT_OK(ret, "write_test_packet_udp")) + goto close; + + if (!ASSERT_TRUE(skel->bss->test_pass, "test_pass")) + dump_err_stream(skel->progs.tc_is_meta_empty); + +close: + if (tap_fd >= 0) + close(tap_fd); + netns_free(ns); + if (pinned) + unlink(LWT_PIN_PATH); +} + +void test_xdp_context_lwt_encap(void) +{ + struct test_xdp_meta *skel; + + skel = test_xdp_meta__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open and load skeleton")) + return; + + if (test__start_subtest("bpf_encap")) + test_lwt_encap(skel, LWT_ENCAP_BPF); + if (test__start_subtest("mpls_encap")) + test_lwt_encap(skel, LWT_ENCAP_MPLS); + if (test__start_subtest("seg6_encap")) + test_lwt_encap(skel, LWT_ENCAP_SEG6); + if (test__start_subtest("ioam6_encap")) + test_lwt_encap(skel, LWT_ENCAP_IOAM6); + + test_xdp_meta__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c index fa73b17cb999..08b03be0b891 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_meta.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c @@ -21,10 +21,6 @@ bool test_pass; -static const __u8 smac_want[ETH_ALEN] = { - 0x12, 0x34, 0xDE, 0xAD, 0xBE, 0xEF, -}; - static const __u8 meta_want[META_SIZE] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, @@ -32,11 +28,6 @@ static const __u8 meta_want[META_SIZE] = { 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, }; -static bool check_smac(const struct ethhdr *eth) -{ - return !__builtin_memcmp(eth->h_source, smac_want, ETH_ALEN); -} - static bool check_metadata(const char *file, int line, __u8 *meta_have) { if (!__builtin_memcmp(meta_have, meta_want, META_SIZE)) @@ -280,18 +271,47 @@ fail: return TC_ACT_SHOT; } +/* Test packets carry test metadata pattern as payload. */ +static bool is_test_packet_xdp(struct xdp_md *ctx) +{ + __u8 meta_have[META_SIZE]; + __u32 len; + + len = bpf_xdp_get_buff_len(ctx); + if (len < META_SIZE) + return false; + if (bpf_xdp_load_bytes(ctx, len - META_SIZE, meta_have, META_SIZE)) + return false; + if (__builtin_memcmp(meta_have, meta_want, META_SIZE)) + return false; + + return true; +} + +/* Test packets carry test metadata pattern as payload. */ +static bool is_test_packet_tc(struct __sk_buff *ctx) +{ + __u8 meta_have[META_SIZE]; + + if (ctx->len < META_SIZE) + return false; + if (bpf_skb_load_bytes(ctx, ctx->len - META_SIZE, meta_have, META_SIZE)) + return false; + if (__builtin_memcmp(meta_have, meta_want, META_SIZE)) + return false; + + return true; +} + /* Reserve and clear space for metadata but don't populate it */ SEC("xdp") int ing_xdp_zalloc_meta(struct xdp_md *ctx) { - struct ethhdr *eth = ctx_ptr(ctx, data); __u8 *meta; int ret; /* Drop any non-test packets */ - if (eth + 1 > ctx_ptr(ctx, data_end)) - return XDP_DROP; - if (!check_smac(eth)) + if (!is_test_packet_xdp(ctx)) return XDP_DROP; ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); @@ -310,33 +330,24 @@ int ing_xdp_zalloc_meta(struct xdp_md *ctx) SEC("xdp") int ing_xdp(struct xdp_md *ctx) { - __u8 *data, *data_meta, *data_end, *payload; - struct ethhdr *eth; + __u8 *data, *data_meta; int ret; + /* Drop any non-test packets */ + if (!is_test_packet_xdp(ctx)) + return XDP_DROP; + ret = bpf_xdp_adjust_meta(ctx, -META_SIZE); if (ret < 0) return XDP_DROP; data_meta = ctx_ptr(ctx, data_meta); - data_end = ctx_ptr(ctx, data_end); data = ctx_ptr(ctx, data); - eth = (struct ethhdr *)data; - payload = data + sizeof(struct ethhdr); - - if (payload + META_SIZE > data_end || - data_meta + META_SIZE > data) + if (data_meta + META_SIZE > data) return XDP_DROP; - /* The Linux networking stack may send other packets on the test - * interface that interfere with the test. Just drop them. - * The test packets can be recognized by their source MAC address. - */ - if (!check_smac(eth)) - return XDP_DROP; - - __builtin_memcpy(data_meta, payload, META_SIZE); + __builtin_memcpy(data_meta, meta_want, META_SIZE); return XDP_PASS; } @@ -353,7 +364,7 @@ int clone_data_meta_survives_data_write(struct __sk_buff *ctx) if (eth + 1 > ctx_ptr(ctx, data_end)) goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; if (meta_have + META_SIZE > eth) @@ -383,7 +394,7 @@ int clone_data_meta_survives_meta_write(struct __sk_buff *ctx) if (eth + 1 > ctx_ptr(ctx, data_end)) goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; if (meta_have + META_SIZE > eth) @@ -416,7 +427,7 @@ int clone_meta_dynptr_survives_data_slice_write(struct __sk_buff *ctx) if (!eth) goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; bpf_dynptr_from_skb_meta(ctx, 0, &meta); @@ -436,16 +447,11 @@ out: SEC("tc") int clone_meta_dynptr_survives_meta_slice_write(struct __sk_buff *ctx) { - struct bpf_dynptr data, meta; - const struct ethhdr *eth; + struct bpf_dynptr meta; __u8 *meta_have; - bpf_dynptr_from_skb(ctx, 0, &data); - eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); - if (!eth) - goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; bpf_dynptr_from_skb_meta(ctx, 0, &meta); @@ -471,15 +477,10 @@ int clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx) { struct bpf_dynptr data, meta; __u8 meta_have[META_SIZE]; - const struct ethhdr *eth; int err; - bpf_dynptr_from_skb(ctx, 0, &data); - eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); - if (!eth) - goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; /* Expect read-write metadata before unclone */ @@ -492,6 +493,7 @@ int clone_meta_dynptr_rw_before_data_dynptr_write(struct __sk_buff *ctx) goto out; /* Helper write to payload will unclone the packet */ + bpf_dynptr_from_skb(ctx, 0, &data); bpf_dynptr_write(&data, offsetof(struct ethhdr, h_proto), "x", 1, 0); err = bpf_dynptr_read(meta_have, META_SIZE, &meta, 0, 0); @@ -511,17 +513,12 @@ out: SEC("tc") int clone_meta_dynptr_rw_before_meta_dynptr_write(struct __sk_buff *ctx) { - struct bpf_dynptr data, meta; + struct bpf_dynptr meta; __u8 meta_have[META_SIZE]; - const struct ethhdr *eth; int err; - bpf_dynptr_from_skb(ctx, 0, &data); - eth = bpf_dynptr_slice(&data, 0, NULL, sizeof(*eth)); - if (!eth) - goto out; /* Ignore non-test packets */ - if (!check_smac(eth)) + if (!is_test_packet_tc(ctx)) goto out; /* Expect read-write metadata before unclone */ @@ -545,6 +542,28 @@ out: return TC_ACT_SHOT; } +SEC("lwt_xmit") +int dummy_lwt_xmit(struct __sk_buff *ctx) +{ + if (bpf_skb_change_head(ctx, sizeof(struct ipv6hdr), 0)) + return BPF_DROP; + + return BPF_OK; +} + +SEC("tc") +int tc_is_meta_empty(struct __sk_buff *ctx) +{ + if (!is_test_packet_tc(ctx)) + return TC_ACT_OK; + + if (ctx->data_meta != ctx->data) + return TC_ACT_OK; + + test_pass = true; + return TC_ACT_OK; +} + SEC("tc") int helper_skb_vlan_push_pop(struct __sk_buff *ctx) { |
