From 7789c6bb76acf21539c2c74b0cc869bb57de99e6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2026 01:10:18 +0200 Subject: net: Add queue-create operation Add a ynl netdev family operation called queue-create that creates a new queue on a netdevice: name: queue-create attribute-set: queue flags: [admin-perm] do: request: attributes: - ifindex - type - lease reply: &queue-create-op attributes: - id This is a generic operation such that it can be extended for various use cases in future. Right now it is mandatory to specify ifindex, the queue type which is enforced to rx and a lease. The newly created queue id is returned to the caller. A queue from a virtual device can have a lease which refers to another queue from a physical device. This is useful for memory providers and AF_XDP operations which take an ifindex and queue id to allow applications to bind against virtual devices in containers. The lease couples both queues together and allows to proxy the operations from a virtual device in a container to the physical device. In future, the nested lease attribute can be lifted and made optional for other use-cases such as dynamic queue creation for physical netdevs. The lack of lease and the specification of the physical device as an ifindex will imply that we need a real queue to be allocated. Similarly, the queue type enforcement to rx can then be lifted as well to support tx. An early implementation had only driver-specific integration [0], but in order for other virtual devices to reuse, it makes sense to have this as a generic API in core net. For leasing queues, the virtual netdev must have real_num_rx_queues less than num_rx_queues at the time of calling queue-create. The queue-type must be rx as only rx queues are supported for leasing for now. We also enforce that the queue-create ifindex must point to a virtual device, and that the nested lease attribute's ifindex must point to a physical device. The nested lease attribute set contains a netns-id attribute which is optional and can specify a netns-id relative to the caller's netns. It requires cap_net_admin and if the netns-id attribute is not specified, the lease ifindex will be retrieved from the current netns. Also, it is modeled as an s32 type similarly as done elsewhere in the stack. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Acked-by: Stanislav Fomichev Reviewed-by: Nikolay Aleksandrov Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0] Link: https://patch.msgid.link/20260402231031.447597-2-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e0b579a1df4f..7df1056a35fd 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -160,6 +160,7 @@ enum { NETDEV_A_QUEUE_DMABUF, NETDEV_A_QUEUE_IO_URING, NETDEV_A_QUEUE_XSK, + NETDEV_A_QUEUE_LEASE, __NETDEV_A_QUEUE_MAX, NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) @@ -202,6 +203,15 @@ enum { NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) }; +enum { + NETDEV_A_LEASE_IFINDEX = 1, + NETDEV_A_LEASE_QUEUE, + NETDEV_A_LEASE_NETNS_ID, + + __NETDEV_A_LEASE_MAX, + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) +}; + enum { NETDEV_A_DMABUF_IFINDEX = 1, NETDEV_A_DMABUF_QUEUES, @@ -228,6 +238,7 @@ enum { NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, NETDEV_CMD_BIND_TX, + NETDEV_CMD_QUEUE_CREATE, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From 48103896053828a8b4d25839a39aa8514071914a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 3 Apr 2026 01:10:27 +0200 Subject: netkit: Add single device mode for netkit Add a single device mode for netkit instead of netkit pairs. The primary target for the paired devices is to connect network namespaces, of course, and support has been implemented in projects like Cilium [0]. For the rxq leasing the plan is to support two main scenarios related to single device mode: * For the use-case of io_uring zero-copy, the control plane can either set up a netkit pair where the peer device can perform rxq leasing which is then tied to the lifetime of the peer device, or the control plane can use a regular netkit pair to connect the hostns to a Pod/container and dynamically add/remove rxq leasing through a single device without having to interrupt the device pair. In the case of io_uring, the memory pool is used as skb non-linear pages, and thus the skb will go its way through the regular stack into netkit. Things like the netkit policy when no BPF is attached or skb scrubbing etc apply as-is in case the paired devices are used, or if the backend memory is tied to the single device and traffic goes through a paired device. * For the use-case of AF_XDP, the control plane needs to use netkit in the single device mode. The single device mode currently enforces only a pass policy when no BPF is attached, and does not yet support BPF link attachments for AF_XDP. skbs sent to that device get dropped at the moment. Given AF_XDP operates at a lower layer of the stack tying this to the netkit pair did not make sense. In future, the plan is to allow BPF at the XDP layer which can: i) process traffic coming from the AF_XDP application (e.g. QEMU with AF_XDP backend) to filter egress traffic or to push selected egress traffic up to the single netkit device to the local stack (e.g. DHCP requests), and ii) vice-versa skbs sent to the single netkit into the AF_XDP application (e.g. DHCP replies). Also, the control-plane can dynamically manage rxq leasing for the single netkit device without having to interrupt (e.g. down/up cycle) the main netkit pair for the Pod which has traffic going in and out. Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Jordan Rife Reviewed-by: Nikolay Aleksandrov Link: https://docs.cilium.io/en/stable/operations/performance/tuning/#netkit-device-mode [0] Link: https://patch.msgid.link/20260402231031.447597-11-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 83a96c56b8ca..280bb1780512 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1296,6 +1296,11 @@ enum netkit_mode { NETKIT_L3, }; +enum netkit_pairing { + NETKIT_DEVICE_PAIR, + NETKIT_DEVICE_SINGLE, +}; + /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to * the BPF program if attached. This also means the latter can * consume the two fields if they were populated earlier. @@ -1320,6 +1325,7 @@ enum { IFLA_NETKIT_PEER_SCRUB, IFLA_NETKIT_HEADROOM, IFLA_NETKIT_TAILROOM, + IFLA_NETKIT_PAIRING, __IFLA_NETKIT_MAX, }; #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) -- cgit v1.2.3