summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2026-04-09 18:24:34 -0700
committerJakub Kicinski <kuba@kernel.org>2026-04-09 18:24:35 -0700
commit15089225889ba4b29f0263757cd66932fa676cb0 (patch)
tree73b8cc252fcebbafad57f5b100c2f774eb7a42c1 /include
parentb6e39e48469e37057fce27a1b87cf6d3e456aa42 (diff)
parent65d657d806848add1e1f0632562d7f47d5d5c188 (diff)
Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'
Daniel Borkmann says: ==================== netkit: Support for io_uring zero-copy and AF_XDP Containers use virtual netdevs to route traffic from a physical netdev in the host namespace. They do not have access to the physical netdev in the host and thus can't use memory providers or AF_XDP that require reconfiguring/restarting queues in the physical netdev. This patchset adds the concept of queue leasing to virtual netdevs that allow containers to use memory providers and AF_XDP at native speed. Leased queues are bound to a real queue in a physical netdev and act as a proxy. Memory providers and AF_XDP operations take an ifindex and queue id, so containers would pass in an ifindex for a virtual netdev and a queue id of a leased queue, which then gets proxied to the underlying real queue. We have implemented support for this concept in netkit and tested the latter against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504 (bnxt_en) 100G NICs. For more details see the individual patches. ==================== Link: https://patch.msgid.link/20260402231031.447597-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'include')
-rw-r--r--include/linux/netdevice.h11
-rw-r--r--include/net/netdev_queues.h23
-rw-r--r--include/net/netdev_rx_queue.h29
-rw-r--r--include/net/page_pool/memory_provider.h8
-rw-r--r--include/uapi/linux/if_link.h6
-rw-r--r--include/uapi/linux/netdev.h11
6 files changed, 75 insertions, 13 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e15367373f7c..47417b2d48a4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2561,7 +2561,14 @@ struct net_device {
* Also protects some fields in:
* struct napi_struct, struct netdev_queue, struct netdev_rx_queue
*
- * Ordering: take after rtnl_lock.
+ * Ordering:
+ *
+ * - take after rtnl_lock
+ *
+ * - for the case of netdev queue leasing, the netdev-scope lock is
+ * taken for both the virtual and the physical device; to prevent
+ * deadlocks, the virtual device's lock must always be acquired
+ * before the physical device's (see netdev_nl_queue_create_doit)
*/
struct mutex lock;
@@ -3413,6 +3420,8 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
+bool unregister_netdevice_queued(const struct net_device *dev);
+
static inline void unregister_netdevice(struct net_device *dev)
{
unregister_netdevice_queue(dev, NULL);
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 95ed28212f4e..70c9fe9e83cc 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -150,6 +150,11 @@ enum {
* When NIC-wide config is changed the callback will
* be invoked for all queues.
*
+ * @ndo_queue_create: Create a new RX queue on a virtual device that will
+ * be paired with a physical device's queue via leasing.
+ * Return the new queue id on success, negative error
+ * on failure.
+ *
* @supported_params: Bitmask of supported parameters, see QCFG_*.
*
* Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
@@ -178,6 +183,8 @@ struct netdev_queue_mgmt_ops {
struct netlink_ext_ack *extack);
struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev,
int idx);
+ int (*ndo_queue_create)(struct net_device *dev,
+ struct netlink_ext_ack *extack);
unsigned int supported_params;
};
@@ -185,7 +192,7 @@ struct netdev_queue_mgmt_ops {
void netdev_queue_config(struct net_device *dev, int rxq,
struct netdev_queue_config *qcfg);
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx);
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
/**
* DOC: Lockless queue stopping / waking helpers.
@@ -373,6 +380,14 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
get_desc, start_thrs); \
})
-struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
-
-#endif
+struct device *netdev_queue_get_dma_dev(struct net_device *dev,
+ unsigned int idx,
+ enum netdev_queue_type type);
+bool netdev_can_create_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_can_lease_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_queue_busy(struct net_device *dev, unsigned int idx,
+ enum netdev_queue_type type,
+ struct netlink_ext_ack *extack);
+#endif /* _LINUX_NET_QUEUES_H */
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 08f81329fc11..7e98c679ea84 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -31,6 +31,14 @@ struct netdev_rx_queue {
struct napi_struct *napi;
struct netdev_queue_config qcfg;
struct pp_memory_provider_params mp_params;
+
+ /* If a queue is leased, then the lease pointer is always
+ * valid. From the physical device it points to the virtual
+ * queue, and from the virtual device it points to the
+ * physical queue.
+ */
+ struct netdev_rx_queue *lease;
+ netdevice_tracker lease_tracker;
} ____cacheline_aligned_in_smp;
/*
@@ -59,6 +67,23 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
return index;
}
-int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+enum netif_lease_dir {
+ NETIF_VIRT_TO_PHYS,
+ NETIF_PHYS_TO_VIRT,
+};
-#endif
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq,
+ enum netif_lease_dir dir);
+
+struct netdev_rx_queue *
+netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq);
+void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
+ struct net_device *dev);
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+#endif /* _LINUX_NETDEV_RX_QUEUE_H */
diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
index ada4f968960a..255ce4cfd975 100644
--- a/include/net/page_pool/memory_provider.h
+++ b/include/net/page_pool/memory_provider.h
@@ -23,14 +23,10 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
void net_mp_niov_clear_page_pool(struct net_iov *niov);
-int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
- struct pp_memory_provider_params *p);
-int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *p,
struct netlink_ext_ack *extack);
-void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
- struct pp_memory_provider_params *old_p);
-void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *old_p);
/**
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 83a96c56b8ca..280bb1780512 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1296,6 +1296,11 @@ enum netkit_mode {
NETKIT_L3,
};
+enum netkit_pairing {
+ NETKIT_DEVICE_PAIR,
+ NETKIT_DEVICE_SINGLE,
+};
+
/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
* the BPF program if attached. This also means the latter can
* consume the two fields if they were populated earlier.
@@ -1320,6 +1325,7 @@ enum {
IFLA_NETKIT_PEER_SCRUB,
IFLA_NETKIT_HEADROOM,
IFLA_NETKIT_TAILROOM,
+ IFLA_NETKIT_PAIRING,
__IFLA_NETKIT_MAX,
};
#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
NETDEV_A_QUEUE_DMABUF,
NETDEV_A_QUEUE_IO_URING,
NETDEV_A_QUEUE_XSK,
+ NETDEV_A_QUEUE_LEASE,
__NETDEV_A_QUEUE_MAX,
NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,6 +204,15 @@ enum {
};
enum {
+ NETDEV_A_LEASE_IFINDEX = 1,
+ NETDEV_A_LEASE_QUEUE,
+ NETDEV_A_LEASE_NETNS_ID,
+
+ __NETDEV_A_LEASE_MAX,
+ NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
+enum {
NETDEV_A_DMABUF_IFINDEX = 1,
NETDEV_A_DMABUF_QUEUES,
NETDEV_A_DMABUF_FD,
@@ -228,6 +238,7 @@ enum {
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
NETDEV_CMD_BIND_TX,
+ NETDEV_CMD_QUEUE_CREATE,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)