summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2026-05-29 19:28:16 +0200
committerPaolo Bonzini <pbonzini@redhat.com>2026-05-29 19:28:16 +0200
commitb3978970165729eea7a80d10d52f17cc495672cd (patch)
treeadfa8ff418b8be54595f53d62f89603ce700c272 /arch
parente7ae89a0c97ce2b68b0983cd01eda67cf373517d (diff)
parenta9e18aa3263f356edae305e29830e5fe63d8597a (diff)
Merge tag 'kvm-x86-fixes-7.1-rc6' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 7.1-rcN - Include the kernel's linux/mman.h in KVM selftests to ensure MADV_COLLAPSE is defined, as older libc versions may not provide it. - Include execinfo.h if and only if KVM selftests are building against glibc, and provide a test_dump_stack() for non-glibc builds. - Fudge around an RCU splat in the emegerncy reboot code that is technically a legitimate flaw, but in practice is a non-issue and fixing the flaw, e.g. by adding locking, would incur meaningful risk, i.e. do more harm than good. - Rate-limit global clock updates once again (but without delayed work), as KVM was subtly relying on the old rate-limiting for NPT correction to guard against "update storms" when running without a master clock on systems with overcommitted CPUs. - Fix a brown paper bag goof where KVM checked if ERAPS is "dirty" instead of marking it dirty when emulating INVPCID. - Flush the TLB when transitioning from xAVIC => x2AVIC to ensure the CPU TLB doesn't contain AVIC-tagged entries for the APIC base GPA.
Diffstat (limited to 'arch')
-rw-r--r--arch/x86/include/asm/kvm_host.h1
-rw-r--r--arch/x86/kvm/svm/avic.c35
-rw-r--r--arch/x86/kvm/x86.c13
-rw-r--r--arch/x86/virt/hw.c15
4 files changed, 54 insertions, 10 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa..f14009f25a3b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1504,6 +1504,7 @@ struct kvm_arch {
bool use_master_clock;
u64 master_kernel_ns;
u64 master_cycle_now;
+ struct ratelimit_state kvmclock_update_rs;
#ifdef CONFIG_KVM_HYPERV
struct kvm_hv hyperv;
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 993b551180fe..cdd5a6dc646f 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -207,6 +207,35 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
/*
+ * Flush the TLB when enabling (x2)AVIC and when transitioning between
+ * xAVIC and x2AVIC, as the CPU may have inserted a TLB entry for the
+ * "wrong" mapping.
+ *
+ * KVM uses a per-VM "scratch" page to back the APIC memslot, because
+ * KVM also uses per-VM page tables *and* maintains the page table (NPT
+ * or shadow page) mappings for said memslot even if one or more vCPUs
+ * have their local APIC hardware-disabled or are in x2APIC mode, i.e.
+ * even if one or more vCPUs' APIC MMIO BAR is effectively disabled.
+ *
+ * If xAVIC is fully enabled, hardware ignores the physical address in
+ * KVM's page tables, i.e. in the leaf SPTE for the APIC memslot, and
+ * instead redirects the access to the AVIC backing page, i.e. to the
+ * vCPU's virtual APIC page. If xAVIC is not enabled (APIC is either
+ * hardware-disabled or in x2APIC mode), then guest accesses will use
+ * the page table mapping verbatim, i.e. will access the per-VM scratch
+ * page, as normal memory.
+ *
+ * In both cases, the CPU is allowed to cache TLB entries for the APIC
+ * base GPA. So, KVM needs to flush the TLB when enabling xAVIC, as
+ * accesses need to be redirected to the virtual APIC page, but the TLB
+ * may contain entries pointing at the scratch page. KVM also needs to
+ * flush the TLB when enabling x2AVIC, as accesses need to go to the
+ * scratch page, but the TLB may contain entries tagged as xAVIC, i.e.
+ * entries pointing to the vCPU's virtual APIC page.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
+ /*
* Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
* accesses, while interrupt injection to a running vCPU can be
* achieved using AVIC doorbell. KVM disables the APIC access page
@@ -219,12 +248,6 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
/* Disabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, false);
} else {
- /*
- * Flush the TLB, the guest may have inserted a non-APIC
- * mapping into the TLB while AVIC was disabled.
- */
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
-
/* Enabling MSR intercept for x2APIC registers */
avic_set_x2apic_msr_interception(svm, true);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c1a72d749084..0550359ed798 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5227,8 +5227,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
*/
- if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) {
+ if (__ratelimit(&vcpu->kvm->arch.kvmclock_update_rs))
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+ else
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ }
+
if (vcpu->cpu != cpu)
kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
vcpu->cpu = cpu;
@@ -13366,6 +13371,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
+ ratelimit_state_init(&kvm->arch.kvmclock_update_rs, HZ, 10);
+ ratelimit_set_flags(&kvm->arch.kvmclock_update_rs, RATELIMIT_MSG_ON_RELEASE);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -14323,7 +14330,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
* the RAP (Return Address Predicator).
*/
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
- kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_ERAPS);
kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
diff --git a/arch/x86/virt/hw.c b/arch/x86/virt/hw.c
index f647557d38ac..7e9091c640be 100644
--- a/arch/x86/virt/hw.c
+++ b/arch/x86/virt/hw.c
@@ -49,7 +49,20 @@ static void x86_virt_invoke_kvm_emergency_callback(void)
{
cpu_emergency_virt_cb *kvm_callback;
- kvm_callback = rcu_dereference(kvm_emergency_callback);
+ /*
+ * RCU may not be watching the crashing CPU here, so rcu_dereference()
+ * triggers a suspicious-RCU-usage splat. In principle, a concurrent
+ * KVM module unload could race with this read; see commit 2baa33a8ddd6
+ * ("KVM: x86: Leave user-return notifier registered on reboot/shutdown")
+ * which notes that nothing prevents module unload during panic/reboot.
+ *
+ * However, taking a lock here would be riskier than the current race:
+ * the system is going down via NMI shootdown, and any lock could be
+ * held by an already-stopped CPU. Use rcu_dereference_raw() to silence
+ * the lockdep splat and accept the comically small remaining race;
+ * panic context inherently cannot guarantee complete correctness.
+ */
+ kvm_callback = rcu_dereference_raw(kvm_emergency_callback);
if (kvm_callback)
kvm_callback();
}