From 02d61d3370ef34ca2a15190a7719d0761ed3ed34 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 7 May 2026 22:08:34 +0200 Subject: KVM: s390: Fix S390_USER_OPEREXEC enablement without STFLE 74 The KVM_CAP_S390_USER_OPEREXEC capability allows operation exceptions to be forwarded to userspace. But the actual enablement at the hardware level occurs in kvm_arch_vcpu_postcreate(), and only if STFLE.74 or user_instr0 are enabled. The latter is associated with a separate capability (KVM_CAP_S390_USER_INSTR0), so the only way this happens for the USER_OPEREXEC capability is if STFLE.74 is enabled. KVM unconditionally enables this bit in kvm_arch_init_vm(), but the guest could disable it from the CPU model and thus ignore this capability. Add USER_OPEREXEC to the check in kvm_arch_vcpu_postcreate(), such that either capability would enable this type of exception. Fixes: 8e8678e740ec ("KVM: s390: Add capability that forwards operation exceptions") Reviewed-by: Claudio Imbrenda Signed-off-by: Eric Farman Reviewed-by: Janosch Frank Reviewed-by: Christian Borntraeger [Fixed patch title, as recommended by frankja@linux.ibm.com] Signed-off-by: Claudio Imbrenda Message-ID: <20260507200836.3500368-2-farman@linux.ibm.com> --- arch/s390/kvm/kvm-s390.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ffb20a64d328..8a3d55410f06 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3542,7 +3542,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.gmap = vcpu->kvm->arch.gmap; sca_add_vcpu(vcpu); } - if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0) + if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0 || + vcpu->kvm->arch.user_operexec) vcpu->arch.sie_block->ictl |= ICTL_OPEREXC; } -- cgit v1.2.3 From 1a4794f17d0f279c55079717bc02d01ec9893eb3 Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 7 May 2026 22:08:35 +0200 Subject: KVM: s390: 
selftests: Extended user_operexec tests There is a possibility that the user_operexec capability only works if facility bit 74 is enabled. This is now fixed, but add a selftest to demonstrate that. Signed-off-by: Eric Farman Acked-by: Janosch Frank Reviewed-by: Claudio Imbrenda Signed-off-by: Claudio Imbrenda Message-ID: <20260507200836.3500368-3-farman@linux.ibm.com> --- .../testing/selftests/kvm/include/s390/facility.h | 6 ++ tools/testing/selftests/kvm/s390/user_operexec.c | 110 +++++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/tools/testing/selftests/kvm/include/s390/facility.h b/tools/testing/selftests/kvm/include/s390/facility.h index 41a265742666..e5259f63be22 100644 --- a/tools/testing/selftests/kvm/include/s390/facility.h +++ b/tools/testing/selftests/kvm/include/s390/facility.h @@ -11,6 +11,7 @@ #ifndef SELFTEST_KVM_FACILITY_H #define SELFTEST_KVM_FACILITY_H +#include #include /* alt_stfle_fac_list[16] + stfle_fac_list[16] */ @@ -19,6 +20,11 @@ extern u64 stfl_doublewords[NB_STFL_DOUBLEWORDS]; extern bool stfle_flag; +static inline bool clear_bit_inv(unsigned long nr, unsigned long *ptr) +{ + return clear_bit(nr ^ (BITS_PER_LONG - 1), ptr); +} + static inline bool test_bit_inv(unsigned long nr, const unsigned long *ptr) { return test_bit(nr ^ (BITS_PER_LONG - 1), ptr); diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c index 714906c1d12a..b24c1f9dbbe8 100644 --- a/tools/testing/selftests/kvm/s390/user_operexec.c +++ b/tools/testing/selftests/kvm/s390/user_operexec.c @@ -6,6 +6,7 @@ * Authors: * Janosch Frank */ +#include "facility.h" #include "kselftest.h" #include "kvm_util.h" #include "test_util.h" @@ -109,6 +110,111 @@ static void test_user_operexec_combined(void) kvm_vm_free(vm); } +static struct kvm_vm *create_vm_without_sthyi(void) +{ + struct kvm_s390_vm_cpu_processor info; + struct kvm_vm *vm; + + vm = vm_create(1); + + kvm_device_attr_get(vm->fd, 
KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + + clear_bit_inv(74, (unsigned long *)&info.fac_list); + kvm_device_attr_set(vm->fd, KVM_S390_VM_CPU_MODEL, + KVM_S390_VM_CPU_PROCESSOR, &info); + + return vm; +} + +static void test_user_instr0_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_instr0); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0000); + + kvm_vm_free(vm); +} + +static void test_user_operexec_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_user_operexec); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + +static void test_instr0_combined_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_instr0); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0000); + + kvm_vm_free(vm); +} + +static void test_operexec_combined_no_stfle_74(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = create_vm_without_sthyi(); + + rc = __vm_enable_cap(vm, 
KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu = vm_vcpu_add(vm, 0, guest_code_user_operexec); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + /* * Run all tests above. * @@ -122,6 +228,10 @@ static struct testdef { { "instr0", test_user_instr0 }, { "operexec", test_user_operexec }, { "operexec_combined", test_user_operexec_combined}, + { "instr0_no_stfle_74", test_user_instr0_no_stfle_74 }, + { "instr0_combined_no_stfle_74", test_instr0_combined_no_stfle_74 }, + { "operexec_combined_no_stfle_74", test_operexec_combined_no_stfle_74 }, + { "operexec_no_stfle_74", test_user_operexec_no_stfle_74 }, }; int main(int argc, char *argv[]) -- cgit v1.2.3 From 2d7d4366d0a6f313b454a533ea0e6a00755df8cf Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 7 May 2026 22:08:36 +0200 Subject: KVM: s390: Fix typo in UCONTROL documentation Small typo noticed while writing the USER_OPEREXEC selftest. Signed-off-by: Eric Farman Reviewed-by: Hendrik Brueckner Reviewed-by: Claudio Imbrenda Message-ID: <20260507200836.3500368-4-farman@linux.ibm.com> Signed-off-by: Claudio Imbrenda --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 52bbbb553ce1..f0eba90602f0 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6827,7 +6827,7 @@ s390 specific. } s390_ucontrol; s390 specific. A page fault has occurred for a user controlled virtual -machine (KVM_VM_S390_UNCONTROL) on its host page table that cannot be +machine (KVM_VM_S390_UCONTROL) on its host page table that cannot be resolved by the kernel. 
The program code and the translation exception code that were placed in the cpu's lowcore are presented here as defined by the z Architecture -- cgit v1.2.3 From d4bb00704a66024502261fa7a523c07420249fea Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:22 +0200 Subject: s390/mm: Fix handling of _PAGE_UNUSED pte bit The _PAGE_UNUSED softbit should not really be lying around. Its sole purpose is to signal to try_to_unmap_one() and try_to_migrate_one() that the page can be discarded instead of being moved / swapped. KVM has no way to know why a page is being unmapped, so it sets the bit on userspace ptes corresponding to unused guest pages every time they get unmapped. KVM has no reasonable way to clear the bit once the page is in use again. While set_ptes() checks and clears the bit, other paths that set new ptes did not. This led to used pages being thrown out as if they were unused, causing guest corruption. Fix the issue by clearing the _PAGE_UNUSED bit for present ptes in set_pte(), i.e. whenever a present pte is getting set. The check in set_ptes() is then redundant and can be removed. Also fix gmap_helper_try_set_pte_unused() to only set the bit if the pte is present; the _PAGE_UNUSED bit is only defined for present ptes and thus should not be set for non-present ptes. 
Fixes: c98175b7917f ("KVM: s390: Add gmap_helper_set_unused()") Acked-by: Heiko Carstens Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-2-imbrenda@linux.ibm.com> --- arch/s390/include/asm/pgtable.h | 4 ++-- arch/s390/mm/gmap_helpers.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2c6cee8241e0..4740c75649eb 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -980,6 +980,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) static inline void set_pte(pte_t *ptep, pte_t pte) { + if (pte_present(pte)) + pte = clear_pte_bit(pte, __pgprot(_PAGE_UNUSED)); WRITE_ONCE(*ptep, pte); } @@ -1332,8 +1334,6 @@ pgprot_t pgprot_writecombine(pgprot_t prot); static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry, unsigned int nr) { - if (pte_present(entry)) - entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); page_table_check_ptes_set(mm, addr, ptep, entry, nr); for (;;) { set_pte(ptep, entry); diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index 1cfe4724fbe2..60023b6fdcb1 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -181,7 +181,8 @@ void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr) if (IS_ERR_OR_NULL(ptep)) return; - __atomic64_or(_PAGE_UNUSED, (long *)ptep); + if (pte_present(*ptep)) + __atomic64_or(_PAGE_UNUSED, (long *)ptep); pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused); -- cgit v1.2.3 From 7a386efcb2bf986e0c9011e92a78aed0870b08cf Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:23 +0200 Subject: KVM: s390: Fix dat_peek_cmma() overflow If userspace passes a start address that is out of bounds, _dat_walk_gfn_range() will fail with -EFAULT, but state.end will not be touched and will stay 0. 
This will cause *count to underflow and report a very high number, and the function will end up erroneously reporting success. Fix by only setting *count if the end address is not smaller than the starting address. This way invalid starting addresses will correctly return -EFAULT and *count will correctly indicate that no values have been returned. Fixes: 7b368470e1a4 ("KVM: s390: KVM page table management functions: CMMA") Reviewed-by: Christian Borntraeger Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-3-imbrenda@linux.ibm.com> --- arch/s390/kvm/dat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index 4a41c0247ffa..cffac7782c4b 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -1209,7 +1209,7 @@ int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values) int rc; rc = _dat_walk_gfn_range(start, start + *count, asce, &ops, DAT_WALK_DEFAULT, &state); - *count = state.end - start; + *count = state.end >= start ? state.end - start : 0; /* Return success if at least one value was saved, otherwise an error. */ return (rc == -EFAULT && *count > 0) ? 0 : rc; } -- cgit v1.2.3 From e6c9b322c8cb3c08270f05e2faabd7c0cc82f809 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:24 +0200 Subject: KVM: s390: Do not set special large pages dirty Special pages / folios should not be set dirty. This also applies to large pages. Add a missing check in gmap_clear_young_crste() to prevent setting the large page dirty if it is a special page. 
Fixes: a2c17f9270cc ("KVM: s390: New gmap code") Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-4-imbrenda@linux.ibm.com> --- arch/s390/kvm/gmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 52d55ddea8d4..3192f610f696 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -327,7 +327,7 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st new.h.i = 1; new.s.fc1.y = 0; new.s.fc1.prefix_notif = 0; - if (new.s.fc1.d || !new.h.p) + if ((new.s.fc1.d || !new.h.p) && !new.s.fc1.s) folio_set_dirty(phys_to_folio(crste_origin_large(crste))); new.s.fc1.d = 0; new.h.p = 1; -- cgit v1.2.3 From 6e976afdfeafeb48f002b977823f67c6a3dd70a0 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:25 +0200 Subject: KVM: s390: Fix code typo in gmap_protect_asce_top_level() The correct length to pass to kvm_s390_get_guest_pages() is asce.tl + 1, not asce.dt + 1. It was a typo, which, due to fortuitous circumstances, did not cause bugs. It should nonetheless be fixed. Fixes: e5f98a6899bd ("KVM: s390: Add some helper functions needed for vSIE") Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-5-imbrenda@linux.ibm.com> --- arch/s390/kvm/gmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index 3192f610f696..e6e786811db8 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -1262,7 +1262,7 @@ static int gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gma /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). 
*/ smp_rmb(); - rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.dt + 1, false); + rc = kvm_s390_get_guest_pages(sg->kvm, context.f, asce.rsto, asce.tl + 1, false); if (rc > 0) rc = -EFAULT; if (!rc) -- cgit v1.2.3 From 2bd74dce0814acc382cfd6903ec902fdcd7b0fed Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:26 +0200 Subject: KVM: s390: Fix handle_{sske,pfmf} under memory pressure Under heavy memory pressure, handle_sske() and handle_pfmf() might cause an endless loop if the mmu cache runs empty, the atomic allocations fail, and the top-up function also fails. While quite unlikely, that scenario is not impossible. Fix the issue by not ignoring the return value of kvm_s390_mmu_cache_topup(), and appropriately returning an error code in case of failure. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Reviewed-by: Christian Borntraeger Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-6-imbrenda@linux.ibm.com> --- arch/s390/kvm/priv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 447ec7ed423d..9bc6fd02ff77 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -366,7 +366,9 @@ static int handle_sske(struct kvm_vcpu *vcpu) if (rc > 1) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); if (rc == -ENOMEM) { - kvm_s390_mmu_cache_topup(vcpu->arch.mc); + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; continue; } if (rc < 0) @@ -1122,7 +1124,9 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) if (rc > 1) return kvm_s390_inject_program_int(vcpu, rc); if (rc == -ENOMEM) { - kvm_s390_mmu_cache_topup(vcpu->arch.mc); + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); + if (rc) + return rc; continue; } if (rc < 0) -- cgit v1.2.3 From 9b0bf9b93cbff50764713b62d0f38d5238eea8c8 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:27 +0200 Subject: KVM: s390: Fix locking in 
kvm_s390_set_mem_control() Add the missing locking around dat_reset_cmma(). Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-7-imbrenda@linux.ibm.com> --- arch/s390/kvm/kvm-s390.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 8a3d55410f06..221b2fb199d4 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -990,9 +990,11 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att if (!kvm->arch.use_cmma) break; + guard(mutex)(&kvm->lock); VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); do { - start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); + scoped_guard(read_lock, &kvm->mmu_lock) + start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); cond_resched(); } while (start_gfn); ret = 0; -- cgit v1.2.3 From 6cfd47f91f6aa3bcf9fe15388be52feb4b180440 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:28 +0200 Subject: KVM: s390: Fix cmma dirty tracking It is possible that some guest memory areas have not been touched yet when starting migration mode, and thus have no ptes allocated. Only existing and allocated ptes should count toward the total of dirty cmma entries. When starting migration mode, enable the migration_mode flag immediately, so that any subsequent ESSA will trap in the host and cause cmma_dirty_pages to be increased as needed. Subsequently, set the cmma_d bit on all existing cmma-clean PGSTEs, increasing cmma_dirty_pages as needed. Skipping cmma-dirty pages prevents double counting. Conversely, when disabling migration mode, set cmma_dirty_pages to 0 and clear the cmma_d bit in all existing PGSTEs. The invariant is that when migration mode is off, no PGSTE has its cmma_d bit set, and cmma_dirty_pages is 0. 
kvm->slots_lock protects kvm_s390_vm_start_migration() and kvm_s390_vm_stop_migration() from each other and from kvm_s390_get_cmma_bits(). Also fix dat_get_cmma() to properly wrap around if the first attempt reached the end of guest memory without finding cmma-dirty pages. [ imbrenda: Moved kvm_s390_sync_request_broadcast() before gmap_set_cmma_all_dirty() ] Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-8-imbrenda@linux.ibm.com> --- arch/s390/kvm/dat.c | 3 +++ arch/s390/kvm/gmap.c | 31 +++++++++++++++++++++++++++---- arch/s390/kvm/gmap.h | 12 +++++++++++- arch/s390/kvm/kvm-s390.c | 46 ++++++++++++++++++++++++++++++++++++---------- arch/s390/kvm/priv.c | 2 +- 5 files changed, 78 insertions(+), 16 deletions(-) diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c index cffac7782c4b..0ad4ebc80eba 100644 --- a/arch/s390/kvm/dat.c +++ b/arch/s390/kvm/dat.c @@ -1253,6 +1253,9 @@ int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, }; _dat_walk_gfn_range(*start, asce_end(asce), asce, &ops, DAT_WALK_IGN_HOLES, &state); + /* If no dirty pages were found, wrap around and continue searching */ + if (*start && state.start == -1) + _dat_walk_gfn_range(0, *start, asce, &ops, DAT_WALK_IGN_HOLES, &state); if (state.start == -1) { *count = 0; diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c index e6e786811db8..0f944944badf 100644 --- a/arch/s390/kvm/gmap.c +++ b/arch/s390/kvm/gmap.c @@ -1073,23 +1073,46 @@ int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gf return 0; } +static long __set_cmma_clean_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) +{ + union pgste pgste; + + pgste = pgste_get_lock(ptep); + pgste.cmma_d = 0; + pgste_set_unlock(ptep, pgste); + + if (need_resched()) + return next; + return 0; +} + static long __set_cmma_dirty_pte(union pte *ptep, gfn_t gfn, gfn_t next, struct dat_walk *walk) { - 
__atomic64_or(PGSTE_CMMA_D_BIT, &pgste_of(ptep)->val); + union pgste pgste; + + pgste = pgste_get_lock(ptep); + if (!pgste.cmma_d) + atomic64_inc(walk->priv); + pgste.cmma_d = 1; + pgste_set_unlock(ptep, pgste); + if (need_resched()) return next; return 0; } -void gmap_set_cmma_all_dirty(struct gmap *gmap) +void _gmap_set_cmma_all(struct gmap *gmap, bool dirty) { - const struct dat_walk_ops ops = { .pte_entry = __set_cmma_dirty_pte, }; + const struct dat_walk_ops ops = { + .pte_entry = dirty ? __set_cmma_dirty_pte : __set_cmma_clean_pte, + }; gfn_t gfn = 0; do { scoped_guard(read_lock, &gmap->kvm->mmu_lock) gfn = _dat_walk_gfn_range(gfn, asce_end(gmap->asce), gmap->asce, &ops, - DAT_WALK_IGN_HOLES, NULL); + DAT_WALK_IGN_HOLES, + &gmap->kvm->arch.cmma_dirty_pages); cond_resched(); } while (gfn); } diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h index 5374f21aaf8d..4e04fbd07696 100644 --- a/arch/s390/kvm/gmap.h +++ b/arch/s390/kvm/gmap.h @@ -103,7 +103,7 @@ int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interr int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level); int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, kvm_pfn_t pfn, int level, bool wr); -void gmap_set_cmma_all_dirty(struct gmap *gmap); +void _gmap_set_cmma_all(struct gmap *gmap, bool dirty); void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn); struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, union asce asce, int edat_level); @@ -197,6 +197,16 @@ static inline bool pte_needs_unshadow(union pte oldpte, union pte newpte, union return !newpte.h.p || !newpte.s.pr; } +static inline void gmap_set_cmma_all_dirty(struct gmap *gmap) +{ + _gmap_set_cmma_all(gmap, true); +} + +static inline void gmap_set_cmma_all_clean(struct gmap *gmap) +{ + _gmap_set_cmma_all(gmap, false); +} + static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte 
newpte, union pgste pgste, gfn_t gfn, bool needs_lock) { diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 221b2fb199d4..9ad6bd4edbce 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1187,13 +1187,13 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req) /* * Must be called with kvm->srcu held to avoid races on memslots, and with - * kvm->slots_lock to avoid races with ourselves and kvm_s390_vm_stop_migration. + * kvm->slots_lock to avoid races with ourselves, kvm_s390_vm_stop_migration(), + * and kvm_s390_get_cmma_bits(). */ static int kvm_s390_vm_start_migration(struct kvm *kvm) { struct kvm_memory_slot *ms; struct kvm_memslots *slots; - unsigned long ram_pages = 0; int bkt; /* migration mode already enabled */ @@ -1210,28 +1210,54 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm) kvm_for_each_memslot(ms, bkt, slots) { if (!ms->dirty_bitmap) return -EINVAL; - ram_pages += ms->npages; } - /* mark all the pages as dirty */ - gmap_set_cmma_all_dirty(kvm->arch.gmap); - atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); - kvm->arch.migration_mode = 1; + /* + * Set the flag and let KVM handle ESSA manually, potentially setting + * the cmma_d bit in some PGSTEs and increasing cmma_dirty_pages. + * At this point cmma_dirty_pages is still 0, and all existing PGSTEs + * have their cmma_d bit set to 0. + * Any newly allocated page table has its entries marked as cmma-clean, + * which is fine because the CMMA values are not dirty. + */ + WRITE_ONCE(kvm->arch.migration_mode, 1); kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); + /* + * Mark all PGSTEs as cmma-dirty, increasing cmma_dirty_pages as needed, + * but without double-counting pages that have become dirty on their own + * in the meantime. + * At this point some pages might have become dirty on their own already + * and cmma_dirty_pages might therefore be non-zero. 
+ */ + gmap_set_cmma_all_dirty(kvm->arch.gmap); return 0; } /* - * Must be called with kvm->slots_lock to avoid races with ourselves and - * kvm_s390_vm_start_migration. + * Must be called with kvm->slots_lock to avoid races with ourselves, + * kvm_s390_vm_start_migration() and kvm_s390_get_cmma_bits(). */ static int kvm_s390_vm_stop_migration(struct kvm *kvm) { /* migration mode already disabled */ if (!kvm->arch.migration_mode) return 0; - kvm->arch.migration_mode = 0; + /* + * Unset the flag and propagate to all vCPUs. From now on the cmma_d + * bit will not be touched on any PGSTE. + * At this point cmma_dirty_pages is possibly non-zero, and thus some + * PGSTEs might have cmma_d set. + */ + WRITE_ONCE(kvm->arch.migration_mode, 0); if (kvm->arch.use_cmma) kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION); + /* Clear cmma_d on all existing PGSTEs and set cmma_dirty_pages to 0. */ + gmap_set_cmma_all_clean(kvm->arch.gmap); + atomic64_set(&kvm->arch.cmma_dirty_pages, 0); + /* + * At this point the system has the expected state: migration_mode is 0, + * cmma_dirty_pages is 0, and all existing PGSTEs have their cmma_d bit + * set to 0. + */ return 0; } diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 9bc6fd02ff77..ad0ddc433a73 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -1236,7 +1236,7 @@ static int handle_essa(struct kvm_vcpu *vcpu) : ESSA_SET_STABLE_IF_RESIDENT)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - if (!vcpu->kvm->arch.migration_mode) { + if (!READ_ONCE(vcpu->kvm->arch.migration_mode)) { /* * CMMA is enabled in the KVM settings, but is disabled in * the SIE block and in the mm_context, and we are not doing -- cgit v1.2.3 From 125a3d3fac51571b8ede0d0599618c6ecd975ea8 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:29 +0200 Subject: KVM: s390: selftests: Fix cmma selftest The existing cmma selftest depended on the host allocating page tables for all present memslots. 
Since the gmap rewrite, memory that is not accessed by the guest might not have page tables allocated yet. This caused the test to fail due to a mismatch in the assertion. Fix by having the guest access also the second half of the test memslot, thus guaranteeing that its page tables are present. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-9-imbrenda@linux.ibm.com> --- tools/testing/selftests/kvm/s390/cmma_test.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/kvm/s390/cmma_test.c b/tools/testing/selftests/kvm/s390/cmma_test.c index e39a724fe860..15d81b2ed7ad 100644 --- a/tools/testing/selftests/kvm/s390/cmma_test.c +++ b/tools/testing/selftests/kvm/s390/cmma_test.c @@ -34,16 +34,22 @@ static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; /** * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot, * so use_cmma goes on and the CMMA related ioctls do something. + * Touch the page at offset 1M inside TEST_DATA to make sure its page + * tables are allocated in the host. */ static void guest_do_one_essa(void) { asm volatile( /* load TEST_DATA_START_GFN into r1 */ + " xgr 1,1\n" " llilf 1,%[start_gfn]\n" /* calculate the address from the gfn */ " sllg 1,1,12(0)\n" /* set the first page in TEST_DATA memslot to STABLE */ " .insn rrf,0xb9ab0000,2,1,1,0\n" + " agfi 1,0x100000\n" + /* also touch the first page of the second MB of TEST_DATA */ + " .insn rrf,0xb9ab0000,2,1,1,0\n" /* hypercall */ " diag 0,0,0x501\n" "0: j 0b" -- cgit v1.2.3 From babe08404e1993697a523e60bc0f9d096ffe1ef8 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Tue, 23 Jun 2026 17:33:30 +0200 Subject: KVM: s390: Return failure in case of failure in kvm_s390_set_cmma_bits() If the allocation of the bits array failed, kvm_s390_set_cmma_bits() would return 0 instead of an error code. 
Rework the function to use the __free() macros and thus simplify the code flow; when the above mentioned allocation fails, simply return -ENOMEM. Fixes: e38c884df921 ("KVM: s390: Switch to new gmap") Reviewed-by: Christian Borntraeger Signed-off-by: Claudio Imbrenda Message-ID: <20260623153331.233784-10-imbrenda@linux.ibm.com> --- arch/s390/kvm/kvm-s390.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9ad6bd4edbce..3b26c909ad0f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2313,8 +2313,8 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm, static int kvm_s390_set_cmma_bits(struct kvm *kvm, const struct kvm_s390_cmma_log *args) { - struct kvm_s390_mmu_cache *mc; - u8 *bits = NULL; + struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL; + u8 *bits __free(kvfree) = NULL; int r = 0; if (!kvm->arch.use_cmma) @@ -2334,18 +2334,16 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, return -ENOMEM; bits = vmalloc(array_size(sizeof(*bits), args->count)); if (!bits) - goto out; + return -ENOMEM; r = copy_from_user(bits, (void __user *)args->values, args->count); - if (r) { - r = -EFAULT; - goto out; - } + if (r) + return -EFAULT; do { r = kvm_s390_mmu_cache_topup(mc); if (r) - break; + return r; scoped_guard(read_lock, &kvm->mmu_lock) { r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn, args->count, args->mask, bits); @@ -2353,9 +2351,7 @@ static int kvm_s390_set_cmma_bits(struct kvm *kvm, } while (r == -ENOMEM); set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags); -out: - kvm_s390_free_mmu_cache(mc); - vfree(bits); + return r; } -- cgit v1.2.3 From f1edbed787ba67988ed34e0132ca128b052b6ce8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Jun 2026 15:52:41 -0700 Subject: KVM: Replace guest-triggerable BUG_ON() in ioeventfd datamatch with get_unaligned() Drop a BUG_ON() that has been reachable since it was first 
added, way back in 2009, and instead use get_unaligned() to perform potentially-unaligned accesses. For a given store, KVM x86's emulator tracks the entire value in the destination operand, x86_emulate_ctxt.dst. If the destination is memory, and the target splits multiple pages and/or is emulated MMIO, then KVM handles each fragment independently. E.g. on a page split starting at page offset 0xffc, KVM writes 4 bytes to the first page, then the remaining bytes to the second page, using ctxt->dst as the source for both (with appropriate offsets). If the destination splits a page *and* hits emulated MMIO on the second page, then KVM will complete the write to the first page, then emulate the MMIO access to the second page. If there is a datamatch-enabled ioeventfd at offset 0 of the second page, then KVM will process the remainder of the store as a potential ioeventfd signal. Putting it all together, if the guest emits a store that splits a page starting at page offset N, and the second page has a datamatch-enabled ioeventfd at offset 0, then KVM will check for datamatch using &dst.valptr[N] as the source. Due to dst (and thus dst.valptr) being 32-byte aligned, if N is not aligned to @len, the BUG_ON() fires. E.g. with a 16-byte store at page offset 0xffc, to an ioeventfd of len 8, all initial checks in ioeventfd_in_range() will succeed, and the BUG_ON() fires due to @val being 4-byte aligned, but not 8-byte aligned. ------------[ cut here ]------------ kernel BUG at arch/x86/kvm/../../../virt/kvm/eventfd.c:783! 
Oops: invalid opcode: 0000 [#1] SMP CPU: 0 UID: 1000 PID: 615 Comm: repro Not tainted 7.1.0-rc2-ff238429d1ea #365 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:ioeventfd_write+0x6c/0x70 [kvm] Call Trace: __kvm_io_bus_write+0x85/0xb0 [kvm] kvm_io_bus_write+0x53/0x80 [kvm] vcpu_mmio_write+0x66/0xf0 [kvm] emulator_read_write_onepage+0x12a/0x540 [kvm] emulator_read_write+0x109/0x2b0 [kvm] x86_emulate_insn+0x4f8/0xfb0 [kvm] x86_emulate_instruction+0x181/0x790 [kvm] kvm_mmu_page_fault+0x313/0x630 [kvm] vmx_handle_exit+0x18a/0x590 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xc81/0x1c90 [kvm] kvm_vcpu_ioctl+0x2d5/0x970 [kvm] __x64_sys_ioctl+0x8a/0xd0 do_syscall_64+0xb7/0x890 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f19c931a9bf Modules linked in: kvm_intel kvm irqbypass ---[ end trace 0000000000000000 ]--- In a perfect world, the fix would be to simply delete the BUG_ON(), as KVM x86 doesn't perform alignment checks on "normal" memory accesses at CPL0. Sadly, C99 ruins all the fun; while the x86 architecture plays nice, dereferencing an unaligned pointer directly is undefined behavior in C, e.g. triggers splats when running with CONFIG_UBSAN_ALIGNMENT=y. 
Fixes: d34e6b175e61 ("KVM: add ioeventfd support") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20260612225241.678509-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 0e8b8a2c5b79..93ad2ebc963f 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -779,21 +780,18 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) return true; /* otherwise, we have to actually compare the data */ - - BUG_ON(!IS_ALIGNED((unsigned long)val, len)); - switch (len) { case 1: - _val = *(u8 *)val; + _val = get_unaligned((u8 *)val); break; case 2: - _val = *(u16 *)val; + _val = get_unaligned((u16 *)val); break; case 4: - _val = *(u32 *)val; + _val = get_unaligned((u32 *)val); break; case 8: - _val = *(u64 *)val; + _val = get_unaligned((u64 *)val); break; default: return false; -- cgit v1.2.3 From 39e9c35e447b8ab49ddbc826c40ca31a425e76b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Jun 2026 11:57:45 -0700 Subject: KVM: x86: Replace BUG_ON() with WARN_ON_ONCE() on "bad" nested GPA translation If KVM attempts to translate what it thinks is an L2 GPA with a non-nested MMU, simply WARN and return the GPA, i.e. trust the MMU more than the caller, as there is zero reason to potentially panic the host kernel just because KVM misused an API. 
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20260618185746.2023283-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 ++- arch/x86/kvm/vmx/nested.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9aedb88c832d..3e6c671a8dc2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -2152,7 +2152,8 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, struct vcpu_svm *svm = to_svm(vcpu); struct kvm_mmu *mmu = vcpu->arch.mmu; - BUG_ON(!mmu_is_nested(vcpu)); + if (WARN_ON_ONCE(!mmu_is_nested(vcpu))) + return gpa; /* Non-GMET walks are always user-walks */ if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET)) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 3a293640d58c..6957bb6f5cf7 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7470,7 +7470,8 @@ static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmu *mmu = vcpu->arch.mmu; - BUG_ON(!mmu_is_nested(vcpu)); + if (WARN_ON_ONCE(!mmu_is_nested(vcpu))) + return gpa; /* * MBEC differentiates based on the effective U/S bit of -- cgit v1.2.3 From 8e5d793fc7173587cfdc075d2bb4a8d016fa050f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Jun 2026 11:56:41 -0700 Subject: KVM: x86/mmu: Bug the VM, not the host kernel, if KVM write-protects upper SPTEs Instead of bugging the host kernel, WARN and terminate the VM if KVM attempts to write-protect at a level that cannot use leaf SPTEs. There is no reason to bring down the entire host; even terminating the VM is likely overkill, but in theory a missed write could corrupt guest memory, so play it safe.
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20260618185641.2022368-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 5b3041138301..c1cbae65d239 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1410,9 +1410,10 @@ static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; - rcu_read_lock(); + if (KVM_BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL, kvm)) + return false; - BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + rcu_read_lock(); for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) { retry: @@ -1844,7 +1845,8 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, u64 new_spte; bool spte_set = false; - BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); + if (KVM_BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL, kvm)) + return false; rcu_read_lock(); -- cgit v1.2.3 From ac604b56115d9936a0876da46033b110cfab7f58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Jun 2026 11:53:50 -0700 Subject: KVM: x86: Bug the VM, not the kernel, if the ISR count {under,over}flows Bug the VM, not the host kernel, if KVM's ISR count {under,over}flows when tracking in-flight ISRs. There is zero danger to the host if KVM messes up its IRQ tracking. 
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20260618185350.2020845-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9d2df8623f6d..e733ca6b9792 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -767,7 +767,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) kvm_x86_call(hwapic_isr_update)(apic->vcpu, vec); else { ++apic->isr_count; - BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + KVM_BUG_ON(apic->isr_count > MAX_APIC_VECTOR, apic->vcpu->kvm); /* * ISR (in service register) bit is set when injecting an interrupt. * The highest vector is injected. Thus the latest bit set matches @@ -808,7 +808,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) kvm_x86_call(hwapic_isr_update)(apic->vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; - BUG_ON(apic->isr_count < 0); + KVM_BUG_ON(apic->isr_count < 0, apic->vcpu->kvm); apic->highest_isr_cache = -1; } } -- cgit v1.2.3 From ea3c9959213641cbcf53add220d3213f16042419 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Jun 2026 11:52:13 -0700 Subject: KVM: x86: WARN and fail kvm_set_irq() if a PIC or I/O APIC vector is invalid WARN and return an error up the stack if the PIC or I/O APIC encounters an invalid vector when injecting an IRQ, as there is no danger to the host and thus no justification for potentially panicking the kernel. Don't bug the VM either, as the risk of corrupting the guest is minuscule, and the guest might even be completely tolerant of a lost interrupt. 
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20260618185213.2019937-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8259.c | 3 ++- arch/x86/kvm/ioapic.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 59e28c45d7dc..6a942ac622d5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -194,7 +194,8 @@ int kvm_pic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq = e->irqchip.pin; int ret, irq_level; - BUG_ON(irq < 0 || irq >= PIC_NUM_PINS); + if (WARN_ON_ONCE(irq < 0 || irq >= PIC_NUM_PINS)) + return -1; pic_lock(s); irq_level = __kvm_irq_line_state(&s->irq_states[irq], diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index f3f4a483ca15..88bd226f3b73 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -504,7 +504,8 @@ int kvm_ioapic_set_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq = e->irqchip.pin; int ret, irq_level; - BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS); + if (WARN_ON_ONCE(irq < 0 || irq >= IOAPIC_NUM_PINS)) + return -1; spin_lock(&ioapic->lock); irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq], -- cgit v1.2.3 From cc3d0e1afd1077796df72da85e0da5266fd532f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Jun 2026 10:45:27 -0700 Subject: KVM: x86: WARN (once) if RTC pending EOI tracking goes off the rails WARN once if KVM's tracking for pending EOIs for Real-Time Clock IRQs goes off the rails, as there's no reason to bug the host or risk a DoS due to spamming dmesg with endless WARNs. Absolute worst case scenario, guest time will go awry. 
Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20260618174527.1982333-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 88bd226f3b73..757667fb2bfa 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -84,7 +84,7 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic) { - if (WARN_ON(ioapic->rtc_status.pending_eoi < 0)) + if (WARN_ON_ONCE(ioapic->rtc_status.pending_eoi < 0)) kvm_rtc_eoi_tracking_restore_all(ioapic); } @@ -484,7 +484,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) * ensures that it is only called if it is >= zero, namely * if rtc_irq_check_coalesced returns false). */ - BUG_ON(ioapic->rtc_status.pending_eoi != 0); + WARN_ON_ONCE(ioapic->rtc_status.pending_eoi); ret = __kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, &ioapic->rtc_status); ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); -- cgit v1.2.3 From 7ef78d71ca713d8c00f7c34ddcf276c808143f77 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Jun 2026 10:43:46 -0700 Subject: KVM: VMX: Grab vmcs12 on CR8 interception update iff vCPU is in guest mode When updating CR8 intercepts, get vmcs12 if and only if the vCPU is in guest mode so that a future change can update CR8 intercepts during vCPU creation, without running afoul of get_vmcs12()'s lockdep assertion.
------------[ cut here ]------------ debug_locks && !(lock_is_held(&(&vcpu->mutex)->dep_map) || !refcount_read(&vcpu->kvm->users_count)) WARNING: arch/x86/kvm/vmx/nested.h:61 at get_vmcs12 arch/x86/kvm/vmx/nested.h:60 [inline], CPU#0: syz.2.19/5879 WARNING: arch/x86/kvm/vmx/nested.h:61 at vmx_update_cr8_intercept+0x3de/0x4e0 arch/x86/kvm/vmx/vmx.c:6879, CPU#0: syz.2.19/5879 Modules linked in: CPU: 0 UID: 0 PID: 5879 Comm: syz.2.19 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 RIP: 0010:get_vmcs12 arch/x86/kvm/vmx/nested.h:60 [inline] RIP: 0010:vmx_update_cr8_intercept+0x3de/0x4e0 arch/x86/kvm/vmx/vmx.c:6879 Call Trace: apic_update_ppr arch/x86/kvm/lapic.c:984 [inline] kvm_lapic_reset+0x1c24/0x2980 arch/x86/kvm/lapic.c:3023 kvm_vcpu_reset+0x44c/0x1bf0 arch/x86/kvm/x86.c:12986 kvm_arch_vcpu_create+0x746/0x8b0 arch/x86/kvm/x86.c:12847 kvm_vm_ioctl_create_vcpu+0x428/0x930 virt/kvm/kvm_main.c:4201 kvm_vm_ioctl+0x893/0xd50 virt/kvm/kvm_main.c:5159 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:597 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f No functional change intended. 
Reported-by: syzbot ci Closes: https://lore.kernel.org/all/6a2adf3b.3b0a2d4e.8c8d1.0012.GAE@google.com Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20260618174347.1981064-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a1a5edb39a7e..125994ed3db5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6872,11 +6872,10 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; if (is_guest_mode(vcpu) && - nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) + nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_TPR_SHADOW)) return; guard(vmx_vmcs01)(vcpu); -- cgit v1.2.3 From bb365a506b1e6fb050c0fceaad354fe395385ef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Thu, 18 Jun 2026 10:43:47 -0700 Subject: KVM: x86: Unconditionally recompute CR8 intercept on PPR update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TPR_THRESHOLD field in the VMCS is used by VMX to induce VM exits when the guest's virtual TPR falls under the specified threshold, allowing KVM to inject previously masked interrupts. KVM handles these VM exits in handle_tpr_below_threshold(). Commit eb90f3417a0c ("KVM: vmx: speed up TPR below threshold vmexits") optimized this function by calling apic_update_ppr() instead of raising KVM_REQ_EVENT. apic_update_ppr() then raises KVM_REQ_EVENT if there is a pending, deliverable interrupt. However, if there are no new interrupts pending, apic_update_ppr() does not issue the request. Thus, kvm_lapic_update_cr8_intercept() and vmx_update_cr8_intercept() are not called before VM entry, which results in a high, stale TPR_THRESHOLD. 
This is problematic due to the following sentence in 28.2.1.1 "VM-Execution Control Fields" in the SDM: The following check is performed if the “use TPR shadow” VM-execution control is 1 and the “virtualize APIC accesses” and “virtual-interrupt delivery” VM-execution controls are both 0: the value of bits 3:0 of the TPR threshold VM-execution control field should not be greater than the value of bits 7:4 of VTPR. This error condition is typically not observed when KVM runs on a bare metal system because modern processors support APICv, which enables virtual-interrupt delivery, and which KVM uses when possible. This causes the processor to no longer generate TPR-below-threshold exits and to no longer check TPR_THRESHOLD on entry. However, when running on older platforms, or under nested virtualization on a hypervisor that does not support virtual-interrupt delivery and enforces this check (like Hyper-V) this can cause a VM entry failure with hardware error 0x7, as seen in [1]. Call kvm_lapic_update_cr8_intercept() if apic_update_ppr() does not find a deliverable interrupt (and thus does not raise KVM_REQ_EVENT). Remove calls to kvm_lapic_update_cr8_intercept() on paths that end up in apic_update_ppr(), as they now become redundant. This ensures that any path that updates the guest's PPR also figures out if KVM needs to wait for a TPR change (using TPR_THRESHOLD on VMX or CR8 intercepts on SVM). 
Link: https://github.com/coconut-svsm/svsm/issues/1081 [1] Tested-by: Stefano Garzarella Cc: stable@vger.kernel.org Fixes: eb90f3417a0c ("KVM: vmx: speed up TPR below threshold vmexits") Signed-off-by: Carlos López Signed-off-by: Sean Christopherson Message-ID: <20260618174347.1981064-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ arch/x86/kvm/x86.c | 5 +---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e733ca6b9792..6f30bbdddb5a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -980,6 +980,8 @@ static void apic_update_ppr(struct kvm_lapic *apic) if (__apic_update_ppr(apic, &ppr) && apic_has_interrupt_for_ppr(apic, ppr) != -1) kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + else + kvm_lapic_update_cr8_intercept(apic->vcpu); } void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d9d51803b7b2..96c465040756 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5317,7 +5317,6 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, r = kvm_apic_set_state(vcpu, s); if (r) return r; - kvm_lapic_update_cr8_intercept(vcpu); return 0; } @@ -12418,8 +12417,6 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); - kvm_set_cr8(vcpu, sregs->cr8); - *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; kvm_x86_call(set_efer)(vcpu, sregs->efer); @@ -12448,7 +12445,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_lapic_update_cr8_intercept(vcpu); + kvm_set_cr8(vcpu, sregs->cr8); /* Older userspace won't unhalt the vcpu on reset. 
*/ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && -- cgit v1.2.3 From 02953418a1378514d1f4086180f14004f5d08ea5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Jun 2026 06:37:27 -0700 Subject: KVM: x86/mmu: Expose number of shadow MMU shadow pages as a stat Turn arch.n_used_mmu_pages into a stat, mmu_shadow_pages, as the number of live shadow pages is arguably _the_ most critical datapoint when it comes to analyzing the shadow MMU. Before the TDP MMU came along, i.e. when the shadow MMU was the only MMU, explicitly tracking the number of shadow pages wasn't as interesting, because the same information could more or less be gleaned from the pages_{1g,2m,4k} stats. But with the TDP MMU, where the shadow MMU is only used for nested TDP, it becomes extremely difficult, if not impossible, to determine which SPTEs are coming from the TDP MMU, and which are coming from the shadow MMU. E.g. when triaging/debugging shadow MMU performance issues due to "too many shadow pages", being able to observe that 99%+ of all shadow pages are unsync is critical to being able to deduce that KVM is effectively leaking shadow pages. 
Signed-off-by: Sean Christopherson Message-ID: <20260612133727.411902-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 14 +++++++------- arch/x86/kvm/mmu/mmutrace.h | 2 +- arch/x86/kvm/x86.c | 1 + 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eee473717c0e..9347c2b62cba 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1434,7 +1434,6 @@ enum kvm_mmu_type { }; struct kvm_arch { - unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; @@ -1700,6 +1699,7 @@ struct kvm_vm_stat { u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; + u64 mmu_shadow_pages; union { struct { atomic64_t pages_4k; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 26ed97efda91..bb09a9af3a35 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1801,13 +1801,13 @@ static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp) static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm->arch.n_used_mmu_pages++; + kvm->stat.mmu_shadow_pages++; kvm_account_pgtable_pages((void *)sp->spt, +1); } static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) { - kvm->arch.n_used_mmu_pages--; + kvm->stat.mmu_shadow_pages--; kvm_account_pgtable_pages((void *)sp->spt, -1); } @@ -2835,9 +2835,9 @@ restart: static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { - if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + if (kvm->arch.n_max_mmu_pages > kvm->stat.mmu_shadow_pages) return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; + kvm->stat.mmu_shadow_pages; return 0; } @@ -2873,11 +2873,11 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { write_lock(&kvm->mmu_lock); - if 
(kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - + if (kvm->stat.mmu_shadow_pages > goal_nr_mmu_pages) { + kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->stat.mmu_shadow_pages - goal_nr_mmu_pages); - goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; + goal_nr_mmu_pages = kvm->stat.mmu_shadow_pages; } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index fa01719baf8d..8354d9f39777 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -303,7 +303,7 @@ TRACE_EVENT( TP_fast_assign( __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen; - __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages; + __entry->mmu_used_pages = kvm->stat.mmu_shadow_pages; ), TP_printk("kvm-mmu-valid-gen %u used_pages %x", diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 96c465040756..afcac1042947 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -244,6 +244,7 @@ const struct kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), + STATS_DESC_ICOUNTER(VM, mmu_shadow_pages), STATS_DESC_ICOUNTER(VM, pages_4k), STATS_DESC_ICOUNTER(VM, pages_2m), STATS_DESC_ICOUNTER(VM, pages_1g), -- cgit v1.2.3 From 098e32cba334da0f3fa8cfd4e022ae7c72341400 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 12 Jun 2026 11:54:59 -0700 Subject: x86/apic: KVM: Use cpu_physical_id() to get APIC ID of running vCPU for AVIC Use cpu_physical_id() instead of default_cpu_present_to_apicid() when getting the APIC ID of the pCPU on which a vCPU is running/loaded, as the kernel has gone way off the rails if a vCPU is loaded on a pCPU that has been physically removed from the system. Even if the impossible were to happen, the absolute worst case scenario is that hardware will ring the AVIC doorbell on the wrong pCPU, i.e.
a severely broken system will experience mild performance issues. Kill off KVM's superfluous kvm_cpu_get_apicid() wrapper along with the for-KVM export of default_cpu_present_to_apicid(), as they existed purely for the wonky AVIC usage. Cc: Kai Huang Cc: Yosry Ahmed Signed-off-by: Sean Christopherson Acked-by: Naveen N Rao (AMD) Reviewed-by: Kai Huang Reviewed-by: Yosry Ahmed Message-ID: <20260612185459.591892-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 10 ---------- arch/x86/kernel/apic/apic_common.c | 1 - arch/x86/kvm/svm/avic.c | 6 +++--- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9347c2b62cba..5f6c1ce9673b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2525,16 +2525,6 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) kvm_x86_call(vcpu_unblocking)(vcpu); } -static inline int kvm_cpu_get_apicid(int mps_cpu) -{ -#ifdef CONFIG_X86_LOCAL_APIC - return default_cpu_present_to_apicid(mps_cpu); -#else - WARN_ON_ONCE(1); - return BAD_APICID; -#endif -} - int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c index 2ed3b5c88c7f..45e6b816353e 100644 --- a/arch/x86/kernel/apic/apic_common.c +++ b/arch/x86/kernel/apic/apic_common.c @@ -26,7 +26,6 @@ u32 default_cpu_present_to_apicid(int mps_cpu) else return BAD_APICID; } -EXPORT_SYMBOL_FOR_KVM(default_cpu_present_to_apicid); /* * Set up the logical destination ID when the APIC operates in logical diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0726f88e679a..58e493a80cb0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -460,8 +460,8 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu) int cpu = READ_ONCE(vcpu->cpu); if (cpu != get_cpu()) { - 
wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); - trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); + wrmsrq(MSR_AMD64_SVM_AVIC_DOORBELL, cpu_physical_id(cpu)); + trace_kvm_avic_doorbell(vcpu->vcpu_id, cpu_physical_id(cpu)); } put_cpu(); } @@ -1013,7 +1013,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu, enum avic_vcpu_action action) { struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); - int h_physical_id = kvm_cpu_get_apicid(cpu); + int h_physical_id = cpu_physical_id(cpu); struct vcpu_svm *svm = to_svm(vcpu); unsigned long flags; u64 entry; -- cgit v1.2.3