summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2026-01-27 09:11:30 -0800
committerAlexei Starovoitov <ast@kernel.org>2026-01-27 09:12:09 -0800
commit35538dba51b4a64f790aefdc6972772dc36b9826 (patch)
tree597b4c1d7bf7711314914dd0311c06e13a645345 /include/linux
parent1456ebb291ddee67c9144c8f7f38a6dddcd32ed7 (diff)
parent17e2ce02bf5669dfa659976e93d409228cba98f9 (diff)
Merge branch 'bpf-fix-fionread-and-copied_seq-issues'
Jiayuan Chen says:

====================
bpf: Fix FIONREAD and copied_seq issues

syzkaller reported a bug [1] where a socket using sockmap, after being
unloaded, exposed incorrect copied_seq calculation. The selftest I
provided can be used to reproduce the issue reported by syzkaller.

TCP recvmsg seq # bug 2: copied E92C873, seq E68D125, rcvnxt E7CEB7C, fl 40
WARNING: CPU: 1 PID: 5997 at net/ipv4/tcp.c:2724 tcp_recvmsg_locked+0xb2f/0x2910 net/ipv4/tcp.c:2724
Call Trace:
 <TASK>
 receive_fallback_to_copy net/ipv4/tcp.c:1968 [inline]
 tcp_zerocopy_receive+0x131a/0x2120 net/ipv4/tcp.c:2200
 do_tcp_getsockopt+0xe28/0x26c0 net/ipv4/tcp.c:4713
 tcp_getsockopt+0xdf/0x100 net/ipv4/tcp.c:4812
 do_sock_getsockopt+0x34d/0x440 net/socket.c:2421
 __sys_getsockopt+0x12f/0x260 net/socket.c:2450
 __do_sys_getsockopt net/socket.c:2457 [inline]
 __se_sys_getsockopt net/socket.c:2454 [inline]
 __x64_sys_getsockopt+0xbd/0x160 net/socket.c:2454
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xcd/0xfa0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

A sockmap socket maintains its own receive queue (ingress_msg) which may
contain data from either its own protocol stack or forwarded from other
sockets.

                                                     FD1:read()
                                                     --  FD1->copied_seq++
                                                         |  [read data]
                                                         |
                                [enqueue data]           v
                  [sockmap]     -> ingress to self ->  ingress_msg queue
FD1 native stack  ------>                                 ^
-- FD1->rcv_nxt++               -> redirect to other      | [enqueue data]
                                       |                  |
                                       |             ingress to FD1
                                       v                  ^
                                      ...                 |  [sockmap]
                                                     FD2 native stack

The issue occurs when reading from ingress_msg: we update tp->copied_seq
by default, but if the data comes from other sockets (not the socket's
own protocol stack), tcp->rcv_nxt remains unchanged. Later, when
converting back to a native socket, reads may fail as copied_seq could
be significantly larger than rcv_nxt.

Additionally, FIONREAD calculation based on copied_seq and rcv_nxt is
insufficient for sockmap sockets, requiring separate field tracking.

[1] https://syzkaller.appspot.com/bug?extid=06dbd397158ec0ea4983

---
v7 -> v9: Address Jakub Sitnicki's feedback:
  - Remove sk_receive_queue check in tcp_bpf_ioctl, only report
    ingress_msg data length for FIONREAD
  - Minor nits fixes
  - Add Reviewed-by tag from John Fastabend
  - Fix ci error
https://lore.kernel.org/bpf/20260113025121.197535-1-jiayuan.chen@linux.dev/

v5 -> v7: Some modifications suggested by Jakub Sitnicki, and added
Reviewed-by tag.
https://lore.kernel.org/bpf/20260106051458.279151-1-jiayuan.chen@linux.dev/

v1 -> v5: Use skmsg.sk instead of extending BPF_F_XXX macro and fix CI
failure reported by CI
v1: https://lore.kernel.org/bpf/20251117110736.293040-1-jiayuan.chen@linux.dev/
====================

Link: https://patch.msgid.link/20260124113314.113584-1-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/skmsg.h70
1 files changed, 68 insertions, 2 deletions
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 49847888c287..829b281d6c9c 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -97,6 +97,8 @@ struct sk_psock {
struct sk_buff_head ingress_skb;
struct list_head ingress_msg;
spinlock_t ingress_lock;
+ /** @msg_tot_len: Total bytes queued in ingress_msg list. */
+ u32 msg_tot_len;
unsigned long state;
struct list_head link;
spinlock_t link_lock;
@@ -141,6 +143,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
struct sk_msg *msg, u32 bytes);
int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
int len, int flags);
+int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags, int *copied_from_self);
bool sk_msg_is_readable(struct sock *sk);
static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
@@ -319,6 +323,27 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
}
+static inline u32 sk_psock_get_msg_len_nolock(struct sk_psock *psock)
+{
+ /* Used by ioctl to read msg_tot_len only; lock-free for performance.
+ * Returns a READ_ONCE() snapshot of psock->msg_tot_len taken without
+ * ingress_lock, so the value may already be stale by the time the
+ * caller uses it. Writers publish updates with WRITE_ONCE() in
+ * sk_psock_msg_len_add_locked().
+ */
+ return READ_ONCE(psock->msg_tot_len);
+}
+
+static inline void sk_psock_msg_len_add_locked(struct sk_psock *psock, int diff)
+{
+ /* Use WRITE_ONCE to ensure correct read in sk_psock_get_msg_len_nolock().
+ * ingress_lock should be held to prevent concurrent updates to msg_tot_len.
+ *
+ * @diff is signed: the callers in this header pass msg->sg.size when a
+ * message is queued and -msg->sg.size when one is dequeued. msg_tot_len
+ * is a u32, so the sum is stored with unsigned wrap-around; nothing
+ * here checks for underflow.
+ */
+ WRITE_ONCE(psock->msg_tot_len, psock->msg_tot_len + diff);
+}
+
+static inline void sk_psock_msg_len_add(struct sk_psock *psock, int diff)
+{ /* Adjust msg_tot_len by @diff, taking ingress_lock around the update. */
+ spin_lock_bh(&psock->ingress_lock); /* lock that sk_psock_msg_len_add_locked() expects to be held */
+ sk_psock_msg_len_add_locked(psock, diff);
+ spin_unlock_bh(&psock->ingress_lock);
+}
+
static inline bool sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
@@ -327,6 +352,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
spin_lock_bh(&psock->ingress_lock);
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
list_add_tail(&msg->list, &psock->ingress_msg);
+ sk_psock_msg_len_add_locked(psock, msg->sg.size);
ret = true;
} else {
sk_msg_free(psock->sk, msg);
@@ -343,18 +369,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
spin_lock_bh(&psock->ingress_lock);
msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
- if (msg)
+ if (msg) {
list_del(&msg->list);
+ sk_psock_msg_len_add_locked(psock, -msg->sg.size);
+ }
spin_unlock_bh(&psock->ingress_lock);
return msg;
}
+static inline struct sk_msg *sk_psock_peek_msg_locked(struct sk_psock *psock)
+{ /* Return the first sk_msg on ingress_msg without unlinking it, or NULL
+ * if the list is empty. No locking is done here; both callers visible
+ * in this header hold ingress_lock around the call.
+ */
+ return list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+}
+
static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
{
struct sk_msg *msg;
spin_lock_bh(&psock->ingress_lock);
- msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+ msg = sk_psock_peek_msg_locked(psock); /* same lookup as the removed line: head of ingress_msg or NULL, left on the list */
spin_unlock_bh(&psock->ingress_lock);
return msg;
}
@@ -521,6 +554,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
return !!psock->saved_data_ready;
}
+/* for tcp only, sk is locked
+ *
+ * Returns the byte total currently accounted for the psock's ingress_msg
+ * list (msg_tot_len, read without ingress_lock), or 0 when sk_psock_get()
+ * returns NULL for @sk. The psock reference taken here is dropped before
+ * returning.
+ */
+static inline ssize_t sk_psock_msg_inq(struct sock *sk)
+{
+ struct sk_psock *psock;
+ ssize_t inq = 0;
+
+ psock = sk_psock_get(sk);
+ if (likely(psock)) {
+ inq = sk_psock_get_msg_len_nolock(psock);
+ sk_psock_put(sk, psock);
+ }
+ return inq;
+}
+
+/* for udp only, sk is not locked
+ *
+ * Returns sg.size of the first message on the psock's ingress_msg list, or
+ * 0 when the list is empty or sk_psock_get() returns NULL for @sk. The
+ * head is peeked and its size read under ingress_lock; the message stays
+ * on the list. The psock reference taken here is dropped before returning.
+ */
+static inline ssize_t sk_msg_first_len(struct sock *sk)
+{
+ struct sk_psock *psock;
+ struct sk_msg *msg;
+ ssize_t inq = 0;
+
+ psock = sk_psock_get(sk);
+ if (likely(psock)) {
+ spin_lock_bh(&psock->ingress_lock);
+ msg = sk_psock_peek_msg_locked(psock);
+ if (msg)
+ inq = msg->sg.size;
+ spin_unlock_bh(&psock->ingress_lock);
+ sk_psock_put(sk, psock);
+ }
+ return inq;
+}
+
#if IS_ENABLED(CONFIG_NET_SOCK_MSG)
#define BPF_F_STRPARSER (1UL << 1)