From 79f80a7a47849ef1b3c25a0bedcc448b9cb551c1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 6 Jan 2026 12:05:27 -0500 Subject: mptcp: fallback earlier on simult connection [ Upstream commit 71154bbe49423128c1c8577b6576de1ed6836830 ] Syzkaller reports a simult-connect race leading to inconsistent fallback status: WARNING: CPU: 3 PID: 33 at net/mptcp/subflow.c:1515 subflow_data_ready+0x40b/0x7c0 net/mptcp/subflow.c:1515 Modules linked in: CPU: 3 UID: 0 PID: 33 Comm: ksoftirqd/3 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:subflow_data_ready+0x40b/0x7c0 net/mptcp/subflow.c:1515 Code: 89 ee e8 78 61 3c f6 40 84 ed 75 21 e8 8e 66 3c f6 44 89 fe bf 07 00 00 00 e8 c1 61 3c f6 41 83 ff 07 74 09 e8 76 66 3c f6 90 <0f> 0b 90 e8 6d 66 3c f6 48 89 df e8 e5 ad ff ff 31 ff 89 c5 89 c6 RSP: 0018:ffffc900006cf338 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888031acd100 RCX: ffffffff8b7f2abf RDX: ffff88801e6ea440 RSI: ffffffff8b7f2aca RDI: 0000000000000005 RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000007 R10: 0000000000000004 R11: 0000000000002c10 R12: ffff88802ba69900 R13: 1ffff920000d9e67 R14: ffff888046f81800 R15: 0000000000000004 FS: 0000000000000000(0000) GS:ffff8880d69bc000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000560fc0ca1670 CR3: 0000000032c3a000 CR4: 0000000000352ef0 Call Trace: tcp_data_queue+0x13b0/0x4f90 net/ipv4/tcp_input.c:5197 tcp_rcv_state_process+0xfdf/0x4ec0 net/ipv4/tcp_input.c:6922 tcp_v6_do_rcv+0x492/0x1740 net/ipv6/tcp_ipv6.c:1672 tcp_v6_rcv+0x2976/0x41e0 net/ipv6/tcp_ipv6.c:1918 ip6_protocol_deliver_rcu+0x188/0x1520 net/ipv6/ip6_input.c:438 ip6_input_finish+0x1e4/0x4b0 net/ipv6/ip6_input.c:489 NF_HOOK include/linux/netfilter.h:318 [inline] NF_HOOK include/linux/netfilter.h:312 [inline] ip6_input+0x105/0x2f0 net/ipv6/ip6_input.c:500 dst_input include/net/dst.h:471 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] NF_HOOK include/linux/netfilter.h:318 [inline] NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0x264/0x650 net/ipv6/ip6_input.c:311 __netif_receive_skb_one_core+0x12d/0x1e0 net/core/dev.c:5979 __netif_receive_skb+0x1d/0x160 net/core/dev.c:6092 process_backlog+0x442/0x15e0 net/core/dev.c:6444 __napi_poll.constprop.0+0xba/0x550 net/core/dev.c:7494 napi_poll net/core/dev.c:7557 [inline] net_rx_action+0xa9f/0xfe0 net/core/dev.c:7684 handle_softirqs+0x216/0x8e0 kernel/softirq.c:579 run_ksoftirqd kernel/softirq.c:968 [inline] run_ksoftirqd+0x3a/0x60 kernel/softirq.c:960 smpboot_thread_fn+0x3f7/0xae0 kernel/smpboot.c:160 kthread+0x3c2/0x780 kernel/kthread.c:463 ret_from_fork+0x5d7/0x6f0 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 The TCP subflow can process the simult-connect syn-ack packet after transitioning to TCP_FIN1 state, bypassing the MPTCP fallback check, as the sk_state_change() callback is not invoked for * -> FIN_WAIT1 transitions. That will move the msk socket to an inconsistent status and the next incoming data will hit the reported splat. Close the race moving the simult-fallback check at the earliest possible stage - that is at syn-ack generation time. About the fixes tags: [2] was supposed to also fix this issue introduced by [3]. [1] is required as a dependence: it was not explicitly marked as a fix, but it is one and it has already been backported before [3]. In other words, this commit should be backported up to [3], including [2] and [1] if that's not already there. Fixes: 23e89e8ee7be ("tcp: Don't drop SYN+ACK for simultaneous connect().") [1] Fixes: 4fd19a307016 ("mptcp: fix inconsistent state on fastopen race") [2] Fixes: 1e777f39b4d7 ("mptcp: add MSG_FASTOPEN sendmsg flag support") [3] Cc: stable@vger.kernel.org Reported-by: syzbot+0ff6b771b4f7a5bce83b@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/586 Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251212-net-mptcp-subflow_data_ready-warn-v1-1-d1f9fd1c36c8@kernel.org Signed-off-by: Paolo Abeni [ adapted mptcp_try_fallback() call ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mptcp/options.c | 10 ++++++++++ net/mptcp/protocol.h | 6 ++---- net/mptcp/subflow.c | 10 +--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bc089388530b..b9c8205fadbf 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -408,6 +408,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { + if (unlikely(subflow_simultaneous_connect(sk))) { + WARN_ON_ONCE(!mptcp_try_fallback(sk)); + + /* Ensure mptcp_finish_connect() will not process the + * MPC handshake. + */ + subflow->request_mptcp = 0; + return false; + } + opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 73b842350677..fcc3ef246166 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1283,10 +1283,8 @@ static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - return (1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && - is_active_ssk(subflow) && - !subflow->conn_finished; + /* Note that the sk state implies !subflow->conn_finished. */ + return sk->sk_state == TCP_SYN_RECV && is_active_ssk(subflow); } #ifdef CONFIG_SYN_COOKIES diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e3d4ed49e588..1618483b05e8 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1848,18 +1848,10 @@ static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - struct mptcp_sock *msk; + struct mptcp_sock *msk = mptcp_sk(parent); __subflow_state_change(sk); - msk = mptcp_sk(parent); - if (subflow_simultaneous_connect(sk)) { - WARN_ON_ONCE(!mptcp_try_fallback(sk)); - pr_fallback(msk); - subflow->conn_finished = 1; - mptcp_propagate_state(parent, sk, subflow, NULL); - } - /* as recvmsg() does not acquire the subflow socket for ssk selection * a fin packet carrying a DSS can be unnoticed if we don't trigger * the data available machinery here. -- cgit v1.2.3 From 914769048818021556c940b9163e8056be9507dd Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Tue, 6 Jan 2026 15:35:01 -0500 Subject: mm/page_alloc: change all pageblocks migrate type on coalescing [ Upstream commit 7838a4eb8a1d23160bd3f588ea7f2b8f7c00c55b ] When a page is freed it coalesces with a buddy into a higher order page while possible. When the buddy page migrate type differs, it is expected to be updated to match the one of the page being freed. However, only the first pageblock of the buddy page is updated, while the rest of the pageblocks are left unchanged. That causes warnings in later expand() and other code paths (like below), since an inconsistency between migration type of the list containing the page and the page-owned pageblocks migration types is introduced. [ 308.986589] ------------[ cut here ]------------ [ 308.987227] page type is 0, passed migratetype is 1 (nr=256) [ 308.987275] WARNING: CPU: 1 PID: 5224 at mm/page_alloc.c:812 expand+0x23c/0x270 [ 308.987293] Modules linked in: algif_hash(E) af_alg(E) nft_fib_inet(E) nft_fib_ipv4(E) nft_fib_ipv6(E) nft_fib(E) nft_reject_inet(E) nf_reject_ipv4(E) nf_reject_ipv6(E) nft_reject(E) nft_ct(E) nft_chain_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) nf_tables(E) s390_trng(E) vfio_ccw(E) mdev(E) vfio_iommu_type1(E) vfio(E) sch_fq_codel(E) drm(E) i2c_core(E) drm_panel_orientation_quirks(E) loop(E) nfnetlink(E) vsock_loopback(E) vmw_vsock_virtio_transport_common(E) vsock(E) ctcm(E) fsm(E) diag288_wdt(E) watchdog(E) zfcp(E) scsi_transport_fc(E) ghash_s390(E) prng(E) aes_s390(E) des_generic(E) des_s390(E) libdes(E) sha3_512_s390(E) sha3_256_s390(E) sha_common(E) paes_s390(E) crypto_engine(E) pkey_cca(E) pkey_ep11(E) zcrypt(E) rng_core(E) pkey_pckmo(E) pkey(E) autofs4(E) [ 308.987439] Unloaded tainted modules: hmac_s390(E):2 [ 308.987650] CPU: 1 UID: 0 PID: 5224 Comm: mempig_verify Kdump: loaded Tainted: G E 6.18.0-gcc-bpf-debug #431 PREEMPT [ 308.987657] Tainted: [E]=UNSIGNED_MODULE [ 308.987661] Hardware name: IBM 3906 M04 704 (z/VM 7.3.0) [ 308.987666] Krnl PSW : 0404f00180000000 00000349976fa600 (expand+0x240/0x270) [ 308.987676] R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:3 PM:0 RI:0 EA:3 [ 308.987682] Krnl GPRS: 0000034980000004 0000000000000005 0000000000000030 000003499a0e6d88 [ 308.987688] 0000000000000005 0000034980000005 000002be803ac000 0000023efe6c8300 [ 308.987692] 0000000000000008 0000034998d57290 000002be00000100 0000023e00000008 [ 308.987696] 0000000000000000 0000000000000000 00000349976fa5fc 000002c99b1eb6f0 [ 308.987708] Krnl Code: 00000349976fa5f0: c020008a02f2 larl %r2,000003499883abd4 00000349976fa5f6: c0e5ffe3f4b5 brasl %r14,0000034997378f60 #00000349976fa5fc: af000000 mc 0,0 >00000349976fa600: a7f4ff4c brc 15,00000349976fa498 00000349976fa604: b9040026 lgr %r2,%r6 00000349976fa608: c0300088317f larl %r3,0000034998800906 00000349976fa60e: c0e5fffdb6e1 brasl %r14,00000349976b13d0 00000349976fa614: af000000 mc 0,0 [ 308.987734] Call Trace: [ 308.987738] [<00000349976fa600>] expand+0x240/0x270 [ 308.987744] ([<00000349976fa5fc>] expand+0x23c/0x270) [ 308.987749] [<00000349976ff95e>] rmqueue_bulk+0x71e/0x940 [ 308.987754] [<00000349976ffd7e>] __rmqueue_pcplist+0x1fe/0x2a0 [ 308.987759] [<0000034997700966>] rmqueue.isra.0+0xb46/0xf40 [ 308.987763] [<0000034997703ec8>] get_page_from_freelist+0x198/0x8d0 [ 308.987768] [<0000034997706fa8>] __alloc_frozen_pages_noprof+0x198/0x400 [ 308.987774] [<00000349977536f8>] alloc_pages_mpol+0xb8/0x220 [ 308.987781] [<0000034997753bf6>] folio_alloc_mpol_noprof+0x26/0xc0 [ 308.987786] [<0000034997753e4c>] vma_alloc_folio_noprof+0x6c/0xa0 [ 308.987791] [<0000034997775b22>] vma_alloc_anon_folio_pmd+0x42/0x240 [ 308.987799] [<000003499777bfea>] __do_huge_pmd_anonymous_page+0x3a/0x210 [ 308.987804] [<00000349976cb08e>] __handle_mm_fault+0x4de/0x500 [ 308.987809] [<00000349976cb14c>] handle_mm_fault+0x9c/0x3a0 [ 308.987813] [<000003499734d70e>] do_exception+0x1de/0x540 [ 308.987822] [<0000034998387390>] __do_pgm_check+0x130/0x220 [ 308.987830] [<000003499839a934>] pgm_check_handler+0x114/0x160 [ 308.987838] 3 locks held by mempig_verify/5224: [ 308.987842] #0: 0000023ea44c1e08 (vm_lock){++++}-{0:0}, at: lock_vma_under_rcu+0xb2/0x2a0 [ 308.987859] #1: 0000023ee4d41b18 (&pcp->lock){+.+.}-{2:2}, at: rmqueue.isra.0+0xad6/0xf40 [ 308.987871] #2: 0000023efe6c8998 (&zone->lock){..-.}-{2:2}, at: rmqueue_bulk+0x5a/0x940 [ 308.987886] Last Breaking-Event-Address: [ 308.987890] [<0000034997379096>] __warn_printk+0x136/0x140 [ 308.987897] irq event stamp: 52330356 [ 308.987901] hardirqs last enabled at (52330355): [<000003499838742e>] __do_pgm_check+0x1ce/0x220 [ 308.987907] hardirqs last disabled at (52330356): [<000003499839932e>] _raw_spin_lock_irqsave+0x9e/0xe0 [ 308.987913] softirqs last enabled at (52329882): [<0000034997383786>] handle_softirqs+0x2c6/0x530 [ 308.987922] softirqs last disabled at (52329859): [<0000034997382f86>] __irq_exit_rcu+0x126/0x140 [ 308.987929] ---[ end trace 0000000000000000 ]--- [ 308.987936] ------------[ cut here ]------------ [ 308.987940] page type is 0, passed migratetype is 1 (nr=256) [ 308.987951] WARNING: CPU: 1 PID: 5224 at mm/page_alloc.c:860 __del_page_from_free_list+0x1be/0x1e0 [ 308.987960] Modules linked in: algif_hash(E) af_alg(E) nft_fib_inet(E) nft_fib_ipv4(E) nft_fib_ipv6(E) nft_fib(E) nft_reject_inet(E) nf_reject_ipv4(E) nf_reject_ipv6(E) nft_reject(E) nft_ct(E) nft_chain_nat(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) nf_tables(E) s390_trng(E) vfio_ccw(E) mdev(E) vfio_iommu_type1(E) vfio(E) sch_fq_codel(E) drm(E) i2c_core(E) drm_panel_orientation_quirks(E) loop(E) nfnetlink(E) vsock_loopback(E) vmw_vsock_virtio_transport_common(E) vsock(E) ctcm(E) fsm(E) diag288_wdt(E) watchdog(E) zfcp(E) scsi_transport_fc(E) ghash_s390(E) prng(E) aes_s390(E) des_generic(E) des_s390(E) libdes(E) sha3_512_s390(E) sha3_256_s390(E) sha_common(E) paes_s390(E) crypto_engine(E) pkey_cca(E) pkey_ep11(E) zcrypt(E) rng_core(E) pkey_pckmo(E) pkey(E) autofs4(E) [ 308.988070] Unloaded tainted modules: hmac_s390(E):2 [ 308.988087] CPU: 1 UID: 0 PID: 5224 Comm: mempig_verify Kdump: loaded Tainted: G W E 6.18.0-gcc-bpf-debug #431 PREEMPT [ 308.988095] Tainted: [W]=WARN, [E]=UNSIGNED_MODULE [ 308.988100] Hardware name: IBM 3906 M04 704 (z/VM 7.3.0) [ 308.988105] Krnl PSW : 0404f00180000000 00000349976f9e32 (__del_page_from_free_list+0x1c2/0x1e0) [ 308.988118] R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:3 CC:3 PM:0 RI:0 EA:3 [ 308.988127] Krnl GPRS: 0000034980000004 0000000000000005 0000000000000030 000003499a0e6d88 [ 308.988133] 0000000000000005 0000034980000005 0000034998d57290 0000023efe6c8300 [ 308.988139] 0000000000000001 0000000000000008 000002be00000100 000002be803ac000 [ 308.988144] 0000000000000000 0000000000000001 00000349976f9e2e 000002c99b1eb728 [ 308.988153] Krnl Code: 00000349976f9e22: c020008a06d9 larl %r2,000003499883abd4 00000349976f9e28: c0e5ffe3f89c brasl %r14,0000034997378f60 #00000349976f9e2e: af000000 mc 0,0 >00000349976f9e32: a7f4ff4e brc 15,00000349976f9cce 00000349976f9e36: b904002b lgr %r2,%r11 00000349976f9e3a: c030008a06e7 larl %r3,000003499883ac08 00000349976f9e40: c0e5fffdbac8 brasl %r14,00000349976b13d0 00000349976f9e46: af000000 mc 0,0 [ 308.988184] Call Trace: [ 308.988188] [<00000349976f9e32>] __del_page_from_free_list+0x1c2/0x1e0 [ 308.988195] ([<00000349976f9e2e>] __del_page_from_free_list+0x1be/0x1e0) [ 308.988202] [<00000349976ff946>] rmqueue_bulk+0x706/0x940 [ 308.988208] [<00000349976ffd7e>] __rmqueue_pcplist+0x1fe/0x2a0 [ 308.988214] [<0000034997700966>] rmqueue.isra.0+0xb46/0xf40 [ 308.988221] [<0000034997703ec8>] get_page_from_freelist+0x198/0x8d0 [ 308.988227] [<0000034997706fa8>] __alloc_frozen_pages_noprof+0x198/0x400 [ 308.988233] [<00000349977536f8>] alloc_pages_mpol+0xb8/0x220 [ 308.988240] [<0000034997753bf6>] folio_alloc_mpol_noprof+0x26/0xc0 [ 308.988247] [<0000034997753e4c>] vma_alloc_folio_noprof+0x6c/0xa0 [ 308.988253] [<0000034997775b22>] vma_alloc_anon_folio_pmd+0x42/0x240 [ 308.988260] [<000003499777bfea>] __do_huge_pmd_anonymous_page+0x3a/0x210 [ 308.988267] [<00000349976cb08e>] __handle_mm_fault+0x4de/0x500 [ 308.988273] [<00000349976cb14c>] handle_mm_fault+0x9c/0x3a0 [ 308.988279] [<000003499734d70e>] do_exception+0x1de/0x540 [ 308.988286] [<0000034998387390>] __do_pgm_check+0x130/0x220 [ 308.988293] [<000003499839a934>] pgm_check_handler+0x114/0x160 [ 308.988300] 3 locks held by mempig_verify/5224: [ 308.988305] #0: 0000023ea44c1e08 (vm_lock){++++}-{0:0}, at: lock_vma_under_rcu+0xb2/0x2a0 [ 308.988322] #1: 0000023ee4d41b18 (&pcp->lock){+.+.}-{2:2}, at: rmqueue.isra.0+0xad6/0xf40 [ 308.988334] #2: 0000023efe6c8998 (&zone->lock){..-.}-{2:2}, at: rmqueue_bulk+0x5a/0x940 [ 308.988346] Last Breaking-Event-Address: [ 308.988350] [<0000034997379096>] __warn_printk+0x136/0x140 [ 308.988356] irq event stamp: 52330356 [ 308.988360] hardirqs last enabled at (52330355): [<000003499838742e>] __do_pgm_check+0x1ce/0x220 [ 308.988366] hardirqs last disabled at (52330356): [<000003499839932e>] _raw_spin_lock_irqsave+0x9e/0xe0 [ 308.988373] softirqs last enabled at (52329882): [<0000034997383786>] handle_softirqs+0x2c6/0x530 [ 308.988380] softirqs last disabled at (52329859): [<0000034997382f86>] __irq_exit_rcu+0x126/0x140 [ 308.988388] ---[ end trace 0000000000000000 ]--- Link: https://lkml.kernel.org/r/20251215081002.3353900A9c-agordeev@linux.ibm.com Link: https://lkml.kernel.org/r/20251212151457.3898073Add-agordeev@linux.ibm.com Fixes: e6cf9e1c4cde ("mm: page_alloc: fix up block types when merging compatible blocks") Signed-off-by: Alexander Gordeev Reported-by: Marc Hartmayer Closes: https://lore.kernel.org/linux-mm/87wmalyktd.fsf@linux.ibm.com/ Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Reviewed-by: Wei Yang Cc: Marc Hartmayer Cc: Signed-off-by: Andrew Morton [ adapted context for function removal ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 765c890e6a84..9d43bd47da26 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -744,6 +744,17 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, NULL) != NULL; } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* * Freeing function for a buddy system allocator. * @@ -830,7 +841,7 @@ static inline void __free_one_page(struct page *page, * expand() down the line puts the sub-blocks * on the right freelists. */ - set_pageblock_migratetype(buddy, migratetype); + change_pageblock_range(buddy, order, migratetype); } combined_pfn = buddy_pfn & pfn; @@ -1817,17 +1828,6 @@ move: } #endif /* CONFIG_MEMORY_ISOLATION */ -static void change_pageblock_range(struct page *pageblock_page, - int start_order, int migratetype) -{ - int nr_pageblocks = 1 << (start_order - pageblock_order); - - while (nr_pageblocks--) { - set_pageblock_migratetype(pageblock_page, migratetype); - pageblock_page += pageblock_nr_pages; - } -} - /* * When we are falling back to another migratetype during allocation, try to * steal extra free pages from the same pageblocks to satisfy further -- cgit v1.2.3 From 11f66b84fa7ebc6946d39507a7ce15a6b4fc32d0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 6 Jan 2026 18:07:46 -0500 Subject: mm: simplify folio_expected_ref_count() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 78cb1a13c42a6d843e21389f74d1edb90ed07288 ] Now that PAGE_MAPPING_MOVABLE is gone, we can simplify and rely on the folio_test_anon() test only. ... but staring at the users, this function should never even have been called on movable_ops pages. E.g., * __buffer_migrate_folio() does not make sense for them * folio_migrate_mapping() does not make sense for them * migrate_huge_page_move_mapping() does not make sense for them * __migrate_folio() does not make sense for them * ... and khugepaged should never stumble over them Let's simply refuse typed pages (which includes slab) except hugetlb, and WARN. Link: https://lkml.kernel.org/r/20250704102524.326966-26-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Harry Yoo Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Brendan Jackman Cc: Byungchul Park Cc: Chengming Zhou Cc: Christian Brauner Cc: Christophe Leroy Cc: Eugenio Pé rez Cc: Greg Kroah-Hartman Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Wang Cc: Jerrin Shaji George Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Peter Xu Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: xu xin Signed-off-by: Andrew Morton Stable-dep-of: f183663901f2 ("mm: consider non-anon swap cache folios in folio_expected_ref_count()") Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 13b4bd7355c1..bbea39a8d441 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2238,13 +2238,13 @@ static inline int folio_expected_ref_count(const struct folio *folio) const int order = folio_order(folio); int ref_count = 0; - if (WARN_ON_ONCE(folio_test_slab(folio))) + if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) return 0; if (folio_test_anon(folio)) { /* One reference per page from the swapcache. */ ref_count += folio_test_swapcache(folio) << order; - } else if (!((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS)) { + } else { /* One reference per page from the pagecache. */ ref_count += !!folio->mapping << order; /* One reference from PG_private. */ -- cgit v1.2.3 From 58a32633d124006fdde480364d2d7475a02b8ad4 Mon Sep 17 00:00:00 2001 From: Bijan Tabatabai Date: Tue, 6 Jan 2026 18:07:47 -0500 Subject: mm: consider non-anon swap cache folios in folio_expected_ref_count() [ Upstream commit f183663901f21fe0fba8bd31ae894bc529709ee0 ] Currently, folio_expected_ref_count() only adds references for the swap cache if the folio is anonymous. However, according to the comment above the definition of PG_swapcache in enum pageflags, shmem folios can also have PG_swapcache set. This patch makes sure references for the swap cache are added if folio_test_swapcache(folio) is true. This issue was found when trying to hot-unplug memory in a QEMU/KVM virtual machine. When initiating hot-unplug when most of the guest memory is allocated, hot-unplug hangs partway through removal due to migration failures. The following message would be printed several times, and would be printed again about every five seconds: [ 49.641309] migrating pfn b12f25 failed ret:7 [ 49.641310] page: refcount:2 mapcount:0 mapping:0000000033bd8fe2 index:0x7f404d925 pfn:0xb12f25 [ 49.641311] aops:swap_aops [ 49.641313] flags: 0x300000000030508(uptodate|active|owner_priv_1|reclaim|swapbacked|node=0|zone=3) [ 49.641314] raw: 0300000000030508 ffffed312c4bc908 ffffed312c4bc9c8 0000000000000000 [ 49.641315] raw: 00000007f404d925 00000000000c823b 00000002ffffffff 0000000000000000 [ 49.641315] page dumped because: migration failure When debugging this, I found that these migration failures were due to __migrate_folio() returning -EAGAIN for a small set of folios because the expected reference count it calculates via folio_expected_ref_count() is one less than the actual reference count of the folios. Furthermore, all of the affected folios were not anonymous, but had the PG_swapcache flag set, inspiring this patch. After applying this patch, the memory hot-unplug behaves as expected. I tested this on a machine running Ubuntu 24.04 with kernel version 6.8.0-90-generic and 64GB of memory. The guest VM is managed by libvirt and runs Ubuntu 24.04 with kernel version 6.18 (though the head of the mm-unstable branch as a Dec 16, 2025 was also tested and behaves the same) and 48GB of memory. The libvirt XML definition for the VM can be found at [1]. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is set in the guest kernel so the hot-pluggable memory is automatically onlined. Below are the steps to reproduce this behavior: 1) Define and start and virtual machine host$ virsh -c qemu:///system define ./test_vm.xml # test_vm.xml from [1] host$ virsh -c qemu:///system start test_vm 2) Setup swap in the guest guest$ sudo fallocate -l 32G /swapfile guest$ sudo chmod 0600 /swapfile guest$ sudo mkswap /swapfile guest$ sudo swapon /swapfile 3) Use alloc_data [2] to allocate most of the remaining guest memory guest$ ./alloc_data 45 4) In a separate guest terminal, monitor the amount of used memory guest$ watch -n1 free -h 5) When alloc_data has finished allocating, initiate the memory hot-unplug using the provided xml file [3] host$ virsh -c qemu:///system detach-device test_vm ./remove.xml --live After initiating the memory hot-unplug, you should see the amount of available memory in the guest decrease, and the amount of used swap data increase. If everything works as expected, when all of the memory is unplugged, there should be around 8.5-9GB of data in swap. If the unplugging is unsuccessful, the amount of used swap data will settle below that. If that happens, you should be able to see log messages in dmesg similar to the one posted above. Link: https://lkml.kernel.org/r/20251216200727.2360228-1-bijan311@gmail.com Link: https://github.com/BijanT/linux_patch_files/blob/main/test_vm.xml [1] Link: https://github.com/BijanT/linux_patch_files/blob/main/alloc_data.c [2] Link: https://github.com/BijanT/linux_patch_files/blob/main/remove.xml [3] Fixes: 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation") Signed-off-by: Bijan Tabatabai Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shivank Garg Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Kairui Song Cc: Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index bbea39a8d441..20f9287d23a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2241,10 +2241,10 @@ static inline int folio_expected_ref_count(const struct folio *folio) if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) return 0; - if (folio_test_anon(folio)) { - /* One reference per page from the swapcache. */ - ref_count += folio_test_swapcache(folio) << order; - } else { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + + if (!folio_test_anon(folio)) { /* One reference per page from the pagecache. */ ref_count += !!folio->mapping << order; /* One reference from PG_private. */ -- cgit v1.2.3 From 1c7c3a9314d8a7fc0e9a508606466a967c8e774a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 6 Jan 2026 18:07:52 -0500 Subject: mptcp: ensure context reset on disconnect() [ Upstream commit 86730ac255b0497a272704de9a1df559f5d6602e ] After the blamed commit below, if the MPC subflow is already in TCP_CLOSE status or has fallback to TCP at mptcp_disconnect() time, mptcp_do_fastclose() skips setting the `send_fastclose flag` and the later __mptcp_close_ssk() does not reset anymore the related subflow context. Any later connection will be created with both the `request_mptcp` flag and the msk-level fallback status off (it is unconditionally cleared at MPTCP disconnect time), leading to a warning in subflow_data_ready(): WARNING: CPU: 26 PID: 8996 at net/mptcp/subflow.c:1519 subflow_data_ready (net/mptcp/subflow.c:1519 (discriminator 13)) Modules linked in: CPU: 26 UID: 0 PID: 8996 Comm: syz.22.39 Not tainted 6.18.0-rc7-05427-g11fc074f6c36 #1 PREEMPT(voluntary) Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: 0010:subflow_data_ready (net/mptcp/subflow.c:1519 (discriminator 13)) Code: 90 0f 0b 90 90 e9 04 fe ff ff e8 b7 1e f5 fe 89 ee bf 07 00 00 00 e8 db 19 f5 fe 83 fd 07 0f 84 35 ff ff ff e8 9d 1e f5 fe 90 <0f> 0b 90 e9 27 ff ff ff e8 8f 1e f5 fe 4c 89 e7 48 89 de e8 14 09 RSP: 0018:ffffc9002646fb30 EFLAGS: 00010293 RAX: 0000000000000000 RBX: ffff88813b218000 RCX: ffffffff825c8435 RDX: ffff8881300b3580 RSI: ffffffff825c8443 RDI: 0000000000000005 RBP: 000000000000000b R08: ffffffff825c8435 R09: 000000000000000b R10: 0000000000000005 R11: 0000000000000007 R12: ffff888131ac0000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f88330af6c0(0000) GS:ffff888a93dd2000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f88330aefe8 CR3: 000000010ff59000 CR4: 0000000000350ef0 Call Trace: tcp_data_ready (net/ipv4/tcp_input.c:5356) tcp_data_queue (net/ipv4/tcp_input.c:5445) tcp_rcv_state_process (net/ipv4/tcp_input.c:7165) tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1955) __release_sock (include/net/sock.h:1158 (discriminator 6) net/core/sock.c:3180 (discriminator 6)) release_sock (net/core/sock.c:3737) mptcp_sendmsg (net/mptcp/protocol.c:1763 net/mptcp/protocol.c:1857) inet_sendmsg (net/ipv4/af_inet.c:853 (discriminator 7)) __sys_sendto (net/socket.c:727 (discriminator 15) net/socket.c:742 (discriminator 15) net/socket.c:2244 (discriminator 15)) __x64_sys_sendto (net/socket.c:2247) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f883326702d Address the issue setting an explicit `fastclosing` flag at fastclose time, and checking such flag after mptcp_do_fastclose(). Fixes: ae155060247b ("mptcp: fix duplicate reset on fastclose") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251212-net-mptcp-subflow_data_ready-warn-v1-2-d1f9fd1c36c8@kernel.org Signed-off-by: Paolo Abeni [ Adjust context ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mptcp/protocol.c | 8 +++++--- net/mptcp/protocol.h | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index aab3c96ecd1c..790feade9bf2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2478,10 +2478,10 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) */ static void __mptcp_subflow_disconnect(struct sock *ssk, struct mptcp_subflow_context *subflow, - unsigned int flags) + bool fastclosing) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - subflow->send_fastclose) { + fastclosing) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ @@ -2533,7 +2533,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { - __mptcp_subflow_disconnect(ssk, subflow, flags); + __mptcp_subflow_disconnect(ssk, subflow, msk->fastclosing); release_sock(ssk); goto out; @@ -2845,6 +2845,7 @@ static void mptcp_do_fastclose(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); + msk->fastclosing = 1; /* Explicitly send the fastclose reset as need */ if (__mptcp_check_fallback(msk)) @@ -3362,6 +3363,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) msk->bytes_sent = 0; msk->bytes_retrans = 0; msk->rcvspace_init = 0; + msk->fastclosing = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index fcc3ef246166..bdec5ad9defb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -316,7 +316,8 @@ struct mptcp_sock { fastopening:1, in_accept_queue:1, free_first:1, - rcvspace_init:1; + rcvspace_init:1, + fastclosing:1; u32 notsent_lowat; int keepalive_cnt; int keepalive_idle; -- cgit v1.2.3 From 7b240a8935d554ad36a52c2c37c32039f9afaef2 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 6 Jan 2026 18:08:39 -0500 Subject: wifi: mac80211: Discard Beacon frames to non-broadcast address [ Upstream commit 193d18f60588e95d62e0f82b6a53893e5f2f19f8 ] Beacon frames are required to be sent to the broadcast address, see IEEE Std 802.11-2020, 11.1.3.1 ("The Address 1 field of the Beacon .. frame shall be set to the broadcast address"). A unicast Beacon frame might be used as a targeted attack to get one of the associated STAs to do something (e.g., using CSA to move it to another channel). As such, it is better have strict filtering for this on the received side and discard all Beacon frames that are sent to an unexpected address. This is even more important for cases where beacon protection is used. The current implementation in mac80211 is correctly discarding unicast Beacon frames if the Protected Frame bit in the Frame Control field is set to 0. However, if that bit is set to 1, the logic used for checking for configured BIGTK(s) does not actually work. If the driver does not have logic for dropping unicast Beacon frames with Protected Frame bit 1, these frames would be accepted in mac80211 processing as valid Beacon frames even though they are not protected. This would allow beacon protection to be bypassed. While the logic for checking beacon protection could be extended to cover this corner case, a more generic check for discard all Beacon frames based on A1=unicast address covers this without needing additional changes. Address all these issues by dropping received Beacon frames if they are sent to a non-broadcast address. Cc: stable@vger.kernel.org Fixes: af2d14b01c32 ("mac80211: Beacon protection using the new BIGTK (STA)") Signed-off-by: Jouni Malinen Link: https://patch.msgid.link/20251215151134.104501-1-jouni.malinen@oss.qualcomm.com Signed-off-by: Johannes Berg [ changed RX_DROP to RX_DROP_MONITOR ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/mac80211/rx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ea6fe21c96c5..e4a3ce716f6b 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3426,6 +3426,11 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) rx->skb->len < IEEE80211_MIN_ACTION_SIZE) return RX_DROP_U_RUNT_ACTION; + /* Drop non-broadcast Beacon frames */ + if (ieee80211_is_beacon(mgmt->frame_control) && + !is_broadcast_ether_addr(mgmt->da)) + return RX_DROP_MONITOR; + if (rx->sdata->vif.type == NL80211_IFTYPE_AP && ieee80211_is_beacon(mgmt->frame_control) && !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) { -- cgit v1.2.3 From 585dbb5cdbb8456f7a248b2c15951f9784dcdeb7 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Tue, 6 Jan 2026 20:03:14 -0500 Subject: net: phy: mediatek: fix nvmem cell reference leak in mt798x_phy_calibration [ Upstream commit 1e5a541420b8c6d87d88eb50b6b978cdeafee1c9 ] When nvmem_cell_read() fails in mt798x_phy_calibration(), the function returns without calling nvmem_cell_put(), leaking the cell reference. Move nvmem_cell_put() right after nvmem_cell_read() to ensure the cell reference is always released regardless of the read result. Found via static analysis and code review. Fixes: 98c485eaf509 ("net: phy: add driver for MediaTek SoC built-in GE PHYs") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Reviewed-by: Daniel Golle Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20251211081313.2368460-1-linmq006@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/mediatek-ge-soc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/mediatek-ge-soc.c b/drivers/net/phy/mediatek-ge-soc.c index f4f9412d0cd7..4b2a9a5444c5 100644 --- a/drivers/net/phy/mediatek-ge-soc.c +++ b/drivers/net/phy/mediatek-ge-soc.c @@ -1082,9 +1082,9 @@ static int mt798x_phy_calibration(struct phy_device *phydev) } buf = (u32 *)nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); if (IS_ERR(buf)) return PTR_ERR(buf); - nvmem_cell_put(cell); if (!buf[0] || !buf[1] || !buf[2] || !buf[3] || len < 4 * sizeof(u32)) { phydev_err(phydev, "invalid efuse data\n"); -- cgit v1.2.3 From c4d18e9540bf20f6c53ecc501b6702d141e8bda5 Mon Sep 17 00:00:00 2001 From: Natalie Vock Date: Wed, 7 Jan 2026 05:53:17 -0500 Subject: drm/amdgpu: Forward VMID reservation errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8defb4f081a5feccc3ea8372d0c7af3522124e1f ] Otherwise userspace may be fooled into believing it has a reserved VMID when in reality it doesn't, ultimately leading to GPU hangs when SPM is used. Fixes: 80e709ee6ecc ("drm/amdgpu: add option params to enforce process isolation between graphics and compute") Cc: stable@vger.kernel.org Reviewed-by: Christian König Signed-off-by: Natalie Vock Signed-off-by: Alex Deucher [ adapted 3-argument amdgpu_vmid_alloc_reserved(adev, vm, vmhub) call to 2-argument version and added separate error check to preserve reserved_vmid tracking logic. ] Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 37d53578825b..211d67a2e48d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2747,10 +2747,12 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) case AMDGPU_VM_OP_RESERVE_VMID: /* We only have requirement to reserve vmid from gfxhub */ if (!fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) { - amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(0)); + int r = amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(0)); + + if (r) + return r; fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = true; } - break; case AMDGPU_VM_OP_UNRESERVE_VMID: if (fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) { -- cgit v1.2.3 From b03136582acb583fc5d4bf5bff3808f63fcec7d4 Mon Sep 17 00:00:00 2001 From: Richa Bharti Date: Wed, 7 Jan 2026 17:19:38 +0530 Subject: cpufreq: intel_pstate: Check IDA only before MSR_IA32_PERF_CTL writes [ Upstream commit 4b747cc628d8f500d56cf1338280eacc66362ff3 ] Commit ac4e04d9e378 ("cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode") introduced a check for feature X86_FEATURE_IDA to verify turbo mode support. Although this is the correct way to check for turbo mode support, it causes issues on some platforms that disable turbo during OS boot, but enable it later [1]. Before adding this feature check, users were able to get turbo mode frequencies by writing 0 to /sys/devices/system/cpu/intel_pstate/no_turbo post-boot. To restore the old behavior on the affected systems while still addressing the unchecked MSR issue on some Skylake-X systems, check X86_FEATURE_IDA only immediately before updates of MSR_IA32_PERF_CTL that may involve setting the Turbo Engage Bit (bit 32). Fixes: ac4e04d9e378 ("cpufreq: intel_pstate: Unchecked MSR aceess in legacy mode") Reported-by: Aaron Rainbolt Closes: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2122531 [1] Tested-by: Aaron Rainbolt Signed-off-by: Srinivas Pandruvada [ rjw: Subject adjustment, changelog edits ] Link: https://patch.msgid.link/20251111010840.141490-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. Wysocki [ richa: Backport to 6.12.y with context adjustments ] Signed-off-by: Richa Bharti Signed-off-by: Greg Kroah-Hartman --- drivers/cpufreq/intel_pstate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index d0f4f7c2ae4d..9d8cb44c26c7 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -600,9 +600,6 @@ static bool turbo_is_disabled(void) { u64 misc_en; - if (!cpu_feature_enabled(X86_FEATURE_IDA)) - return true; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); return !!(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); @@ -2018,7 +2015,8 @@ static u64 atom_get_val(struct cpudata *cpudata, int pstate) u32 vid; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) val |= (u64)1 << 32; vid_fp = cpudata->vid.min + mul_fp( @@ -2183,7 +2181,8 @@ static u64 core_get_val(struct cpudata *cpudata, int pstate) u64 val; val = (u64)pstate << 8; - if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled)) + if (READ_ONCE(global.no_turbo) && !READ_ONCE(global.turbo_disabled) && + cpu_feature_enabled(X86_FEATURE_IDA)) val |= (u64)1 << 32; return val; -- cgit v1.2.3 From 4888e1dcc341e9a132ef7b8516234b3c3296de56 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 7 Jan 2026 14:19:50 -0300 Subject: net: Remove RTNL dance for SIOCBRADDIF and SIOCBRDELIF. commit ed3ba9b6e280e14cc3148c1b226ba453f02fa76c upstream. SIOCBRDELIF is passed to dev_ioctl() first and later forwarded to br_ioctl_call(), which causes unnecessary RTNL dance and the splat below [0] under RTNL pressure. Let's say Thread A is trying to detach a device from a bridge and Thread B is trying to remove the bridge. In dev_ioctl(), Thread A bumps the bridge device's refcnt by netdev_hold() and releases RTNL because the following br_ioctl_call() also re-acquires RTNL. In the race window, Thread B could acquire RTNL and try to remove the bridge device. Then, rtnl_unlock() by Thread B will release RTNL and wait for netdev_put() by Thread A. Thread A, however, must hold RTNL after the unlock in dev_ifsioc(), which may take long under RTNL pressure, resulting in the splat by Thread B. Thread A (SIOCBRDELIF) Thread B (SIOCBRDELBR) ---------------------- ---------------------- sock_ioctl sock_ioctl `- sock_do_ioctl `- br_ioctl_call `- dev_ioctl `- br_ioctl_stub |- rtnl_lock | |- dev_ifsioc ' ' |- dev = __dev_get_by_name(...) |- netdev_hold(dev, ...) . / |- rtnl_unlock ------. | | |- br_ioctl_call `---> |- rtnl_lock Race | | `- br_ioctl_stub |- br_del_bridge Window | | | |- dev = __dev_get_by_name(...) | | | May take long | `- br_dev_delete(dev, ...) | | | under RTNL pressure | `- unregister_netdevice_queue(dev, ...) | | | | `- rtnl_unlock \ | |- rtnl_lock <-' `- netdev_run_todo | |- ... `- netdev_run_todo | `- rtnl_unlock |- __rtnl_unlock | |- netdev_wait_allrefs_any |- netdev_put(dev, ...) <----------------' Wait refcnt decrement and log splat below To avoid blocking SIOCBRDELBR unnecessarily, let's not call dev_ioctl() for SIOCBRADDIF and SIOCBRDELIF. In the dev_ioctl() path, we do the following: 1. Copy struct ifreq by get_user_ifreq in sock_do_ioctl() 2. Check CAP_NET_ADMIN in dev_ioctl() 3. Call dev_load() in dev_ioctl() 4. Fetch the master dev from ifr.ifr_name in dev_ifsioc() 3. can be done by request_module() in br_ioctl_call(), so we move 1., 2., and 4. to br_ioctl_stub(). Note that 2. is also checked later in add_del_if(), but it's better performed before RTNL. SIOCBRADDIF and SIOCBRDELIF have been processed in dev_ioctl() since the pre-git era, and there seems to be no specific reason to process them there. [0]: unregister_netdevice: waiting for wpan3 to become free. Usage count = 2 ref_tracker: wpan3@ffff8880662d8608 has 1/1 users at __netdev_tracker_alloc include/linux/netdevice.h:4282 [inline] netdev_hold include/linux/netdevice.h:4311 [inline] dev_ifsioc+0xc6a/0x1160 net/core/dev_ioctl.c:624 dev_ioctl+0x255/0x10c0 net/core/dev_ioctl.c:826 sock_do_ioctl+0x1ca/0x260 net/socket.c:1213 sock_ioctl+0x23a/0x6c0 net/socket.c:1318 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl fs/ioctl.c:892 [inline] __x64_sys_ioctl+0x1a4/0x210 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcb/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 893b19587534 ("net: bridge: fix ioctl locking") Reported-by: syzkaller Reported-by: yan kang Reported-by: yue sun Closes: https://lore.kernel.org/netdev/SY8P300MB0421225D54EB92762AE8F0F2A1D32@SY8P300MB0421.AUSP300.PROD.OUTLOOK.COM/ Signed-off-by: Kuniyuki Iwashima Acked-by: Stanislav Fomichev Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250316192851.19781-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni [cascardo: fixed conflict at dev_ifsioc] Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Greg Kroah-Hartman --- include/linux/if_bridge.h | 6 ++---- net/bridge/br_ioctl.c | 36 +++++++++++++++++++++++++++++++++--- net/bridge/br_private.h | 3 +-- net/core/dev_ioctl.c | 16 ---------------- net/socket.c | 19 +++++++++---------- 5 files changed, 45 insertions(+), 35 deletions(-) diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3ff96ae31bf6..c5fe3b2a53e8 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -65,11 +65,9 @@ struct br_ip_list { #define BR_DEFAULT_AGEING_TIME (300 * HZ) struct net_bridge; -void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)); -int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg); +int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg); #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index f213ed108361..6bc0a11f2ed3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -394,10 +394,26 @@ static int old_deviceless(struct net *net, void __user *data) return -EOPNOTSUPP; } -int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg) +int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg) { int ret = -EOPNOTSUPP; + struct ifreq ifr; + + if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) { + void __user *data; + char *colon; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (get_user_ifreq(&ifr, &data, uarg)) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ - 1] = 0; + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + } rtnl_lock(); @@ -430,7 +446,21 @@ int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, break; case SIOCBRADDIF: case SIOCBRDELIF: - ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); + { + struct net_device *dev; + + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev || !netif_device_present(dev)) { + ret = -ENODEV; + break; + } + if (!netif_is_bridge_master(dev)) { + ret = -EOPNOTSUPP; + break; + } + + ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF); + } break; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a2e59108a5dc..b2e4e2d04f02 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -953,8 +953,7 @@ br_port_get_check_rtnl(const struct net_device *dev) /* br_ioctl.c */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd); -int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg); +int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 473c437b6b53..81cd8df798c0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -514,7 +514,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; - netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -577,19 +576,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!netif_device_present(dev)) - return -ENODEV; - if (!netif_is_bridge_master(dev)) - return -EOPNOTSUPP; - netdev_hold(dev, &dev_tracker, GFP_KERNEL); - rtnl_unlock(); - err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - netdev_put(dev, &dev_tracker); - rtnl_lock(); - return err; - case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); @@ -770,8 +756,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; diff --git a/net/socket.c b/net/socket.c index 042451f01c65..a0f6f8b3376d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1173,12 +1173,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) */ static DEFINE_MUTEX(br_ioctl_mutex); -static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +static int (*br_ioctl_hook)(struct net *net, unsigned int cmd, void __user *uarg); -void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); @@ -1187,8 +1185,7 @@ void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, } EXPORT_SYMBOL(brioctl_set); -int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg) +int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg) { int err = -ENOPKG; @@ -1197,7 +1194,7 @@ int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) - err = br_ioctl_hook(net, br, cmd, ifr, uarg); + err = br_ioctl_hook(net, cmd, uarg); mutex_unlock(&br_ioctl_mutex); return err; @@ -1297,7 +1294,9 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: - err = br_ioctl_call(net, NULL, cmd, NULL, argp); + case SIOCBRADDIF: + case SIOCBRDELIF: + err = br_ioctl_call(net, cmd, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: @@ -3466,6 +3465,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: + case SIOCBRADDIF: + case SIOCBRDELIF: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: @@ -3505,8 +3506,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: -- cgit v1.2.3 From 52aa889c6f57f00a5743bbc71a2e23d0660d5e2b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 17:01:24 +0100 Subject: sched/fair: Small cleanup to sched_balance_newidle() commit e78e70dbf603c1425f15f32b455ca148c932f6c1 upstream. Pull out the !sd check to simplify code. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Tested-by: Chris Mason Link: https://patch.msgid.link/20251107161739.525916173@infradead.org [ Ajay: Modified to apply on v6.12 ] Signed-off-by: Ajay Kaher Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 22dc54aab8dd..8a46cac2f7f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12864,14 +12864,16 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (!sd) { + rcu_read_unlock(); + goto out; + } if (!get_rd_overloaded(this_rq->rd) || - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { + this_rq->avg_idle < sd->max_newidle_lb_cost) { - if (sd) - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); - goto out; } rcu_read_unlock(); -- cgit v1.2.3 From c6ae271bc5fd44b48ebe43f63c4d2ae972d296da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 17:01:27 +0100 Subject: sched/fair: Small cleanup to update_newidle_cost() commit 08d473dd8718e4a4d698b1113a14a40ad64a909b upstream. Simplify code by adding a few variables. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Tested-by: Chris Mason Link: https://patch.msgid.link/20251107161739.655208666@infradead.org [ Ajay: Modified to apply on v6.12 ] Signed-off-by: Ajay Kaher Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8a46cac2f7f8..a801ba7341ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12188,22 +12188,25 @@ void update_max_interval(void) static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) { + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. */ sd->max_newidle_lb_cost = cost; - sd->last_decay_max_lb_cost = jiffies; - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + sd->last_decay_max_lb_cost = now; + + } else if (time_after(now, next_decay)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter. */ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; - sd->last_decay_max_lb_cost = jiffies; - + sd->last_decay_max_lb_cost = now; return true; } -- cgit v1.2.3 From 1b9c118fe318a43aed0b8a516b23817cd3623cf8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 17:01:31 +0100 Subject: sched/fair: Proportional newidle balance commit 33cf66d88306663d16e4759e9d24766b0aaa2e17 upstream. Add a randomized algorithm that runs newidle balancing proportional to its success rate. This improves schbench significantly: 6.18-rc4: 2.22 Mrps/s 6.18-rc4+revert: 2.04 Mrps/s 6.18-rc4+revert+random: 2.18 Mrps/S Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: 6.17: -6% 6.17+revert: 0% 6.17+revert+random: -1% Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Tested-by: Chris Mason Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com Link: https://patch.msgid.link/20251107161739.770122091@infradead.org [ Ajay: Modified to apply on v6.12 ] Signed-off-by: Ajay Kaher Signed-off-by: Greg Kroah-Hartman --- include/linux/sched/topology.h | 3 +++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 44 ++++++++++++++++++++++++++++++++++++++---- kernel/sched/features.h | 5 +++++ kernel/sched/sched.h | 7 +++++++ kernel/sched/topology.c | 6 ++++++ 6 files changed, 64 insertions(+), 4 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4237daa5ac7a..3cf27591fe90 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -106,6 +106,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b1953b6c76a..b1895b330ff0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -118,6 +118,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_SCHED_DEBUG /* @@ -8335,6 +8336,8 @@ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a801ba7341ec..1436d6bb86ec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12186,11 +12186,27 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) +{ + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) { unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; unsigned long now = jiffies; + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the @@ -12238,7 +12254,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12896,6 +12912,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12903,10 +12935,14 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 050d7503064e..da8ec0c23f25 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -122,3 +122,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6070331772ea..62f90dcb10a1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include #include #include #include @@ -1348,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 4bd825c24e26..bd8b2b301570 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1632,6 +1632,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, -- cgit v1.2.3 From 828b59fdf8ef22ab2d59ce349d16231723692b50 Mon Sep 17 00:00:00 2001 From: Maximilian Immanuel Brandtner Date: Mon, 24 Mar 2025 15:42:46 +0100 Subject: virtio_console: fix order of fields cols and rows commit 5326ab737a47278dbd16ed3ee7380b26c7056ddd upstream. According to section 5.3.6.2 (Multiport Device Operation) of the virtio spec(version 1.2) a control buffer with the event VIRTIO_CONSOLE_RESIZE is followed by a virtio_console_resize struct containing cols then rows. The kernel implements this the wrong way around (rows then cols) resulting in the two values being swapped. Signed-off-by: Maximilian Immanuel Brandtner Message-Id: <20250324144300.905535-1-maxbr@linux.ibm.com> Signed-off-by: Michael S. Tsirkin Cc: Filip Hejsek Signed-off-by: Greg Kroah-Hartman --- drivers/char/virtio_console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index abcfdd3c2918..8c4e2b1e3de0 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1579,8 +1579,8 @@ static void handle_control_message(struct virtio_device *vdev, break; case VIRTIO_CONSOLE_RESIZE: { struct { - __virtio16 rows; __virtio16 cols; + __virtio16 rows; } size; if (!is_console_port(port)) -- cgit v1.2.3 From 69f542a54578a2854f6f2b5e87eec2a1504a008d Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Thu, 8 Jan 2026 13:45:23 +0100 Subject: pwm: stm32: Always program polarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 7346e7a058a2 ("pwm: stm32: Always do lazy disabling") triggered a regression where PWM polarity changes could be ignored. stm32_pwm_set_polarity() was skipped due to a mismatch between the cached pwm->state.polarity and the actual hardware state, leaving the hardware polarity unchanged. Fixes: 7edf7369205b ("pwm: Add driver for STM32 plaftorm") Cc: stable@vger.kernel.org # <= 6.12 Signed-off-by: Sean Nyekjaer Co-developed-by: Uwe Kleine-König Signed-off-by: Uwe Kleine-König --- drivers/pwm/pwm-stm32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index 4f231f8aae7d..778346039ded 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -458,8 +458,7 @@ static int stm32_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } - if (state->polarity != pwm->state.polarity) - stm32_pwm_set_polarity(priv, pwm->hwpwm, state->polarity); + stm32_pwm_set_polarity(priv, pwm->hwpwm, state->polarity); ret = stm32_pwm_config(priv, pwm->hwpwm, state->duty_cycle, state->period); -- cgit v1.2.3 From b00d41629d81f2f2da92cbbc6f89f844613d7621 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 9 Jan 2026 11:48:34 +0100 Subject: Revert "iommu/amd: Skip enabling command/event buffers for kdump" This reverts commit 3344716ddee01d7caa35b470d30fd87781c661b1 which is commit 9be15fbfc6c5c89c22cf6e209f66ea43ee0e58bb upstream. This causes problems in older kernel trees as SNP host kdump is not supported in them, so drop it from the stable branches. Reported-by: Ashish Kalra Link: https://lore.kernel.org/r/dacdff7f-0606-4ed5-b056-2de564404d51@amd.com Cc: Vasant Hegde Cc: Sairaj Kodilkar Cc: Joerg Roedel Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/amd/init.c | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 8659cb0bc7e6..e1816ae8699d 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -816,16 +816,11 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) BUG_ON(iommu->cmd_buf == NULL); - if (!is_kdump_kernel()) { - /* - * Command buffer is re-used for kdump kernel and setting - * of MMIO register is not required. - */ - entry = iommu_virt_to_phys(iommu->cmd_buf); - entry |= MMIO_CMD_SIZE_512; - memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, - &entry, sizeof(entry)); - } + entry = iommu_virt_to_phys(iommu->cmd_buf); + entry |= MMIO_CMD_SIZE_512; + + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, + &entry, sizeof(entry)); amd_iommu_reset_cmd_buffer(iommu); } @@ -874,15 +869,10 @@ static void iommu_enable_event_buffer(struct amd_iommu *iommu) BUG_ON(iommu->evt_buf == NULL); - if (!is_kdump_kernel()) { - /* - * Event buffer is re-used for kdump kernel and setting - * of MMIO register is not required. - */ - entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; - memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, - &entry, sizeof(entry)); - } + entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); /* set head and tail to zero manually */ writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); -- cgit v1.2.3 From 39cb076c7dc7e44e3cab5c82ffda16a550ed8436 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 11 Jan 2026 15:25:22 +0100 Subject: Linux 6.12.65 Link: https://lore.kernel.org/r/20260109112133.973195406@linuxfoundation.org Tested-by: Brett A C Sheffield Tested-by: Florian Fainelli Tested-by: Slade Watkins Tested-by: Shuah Khan Tested-by: Peter Schneider Tested-by: Ron Economos Tested-by: Francesco Dolcini Tested-by: Jeffrin Jose T Tested-by: Jon Hunter Tested-by: Mark Brown Tested-by: Miguel Ojeda Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index e8e272e1187b..ab23f9796ac4 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 12 -SUBLEVEL = 64 +SUBLEVEL = 65 EXTRAVERSION = NAME = Baby Opossum Posse -- cgit v1.2.3