summaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-17 07:18:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-17 07:18:03 -0700
commit01f492e1817e858d1712f2489d0afbaa552f417b (patch)
tree9ba6df223570acd45ccb2ba647407f75f4393eab /include/linux
parente55d98e7756135f32150b9b8f75d580d0d4b2dd3 (diff)
parent6b802031877a995456c528095c41d1948546bf45 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini: "Arm: - Add support for tracing in the standalone EL2 hypervisor code, which should help both debugging and performance analysis. This uses the new infrastructure for 'remote' trace buffers that can be exposed by non-kernel entities such as firmware, and which came through the tracing tree - Add support for GICv5 Per Processor Interrupts (PPIs), as the starting point for supporting the new GIC architecture in KVM - Finally add support for pKVM protected guests, where pages are unmapped from the host as they are faulted into the guest and can be shared back from the guest using pKVM hypercalls. Protected guests are created using a new machine type identifier. As the elusive guestmem has not yet delivered on its promises, anonymous memory is also supported This is only a first step towards full isolation from the host; for example, the CPU register state and DMA accesses are not yet isolated. Because this does not really yet bring fully what it promises, it is hidden behind CONFIG_ARM_PKVM_GUEST + 'kvm-arm.mode=protected', and also triggers TAINT_USER when a VM is created. Caveat emptor - Rework the dreaded user_mem_abort() function to make it more maintainable, reducing the amount of state being exposed to the various helpers and rendering a substantial amount of state immutable - Expand the Stage-2 page table dumper to support NV shadow page tables on a per-VM basis - Tidy up the pKVM PSCI proxy code to be slightly less hard to follow - Fix both SPE and TRBE in non-VHE configurations so that they do not generate spurious, out of context table walks that ultimately lead to very bad HW lockups - A small set of patches fixing the Stage-2 MMU freeing in error cases - Tighten-up accepted SMC immediate value to be only #0 for host SMCCC calls - The usual cleanups and other selftest churn LoongArch: - Use CSR_CRMD_PLV for kvm_arch_vcpu_in_kernel() - Add DMSINTC irqchip in kernel support RISC-V: - Fix steal time shared memory alignment checks - Fix vector context allocation leak - Fix array out-of-bounds in pmu_ctr_read() and pmu_fw_ctr_read_hi() - Fix double-free of sdata in kvm_pmu_clear_snapshot_area() - Fix integer overflow in kvm_pmu_validate_counter_mask() - Fix shift-out-of-bounds in make_xfence_request() - Fix lost write protection on huge pages during dirty logging - Split huge pages during fault handling for dirty logging - Skip CSR restore if VCPU is reloaded on the same core - Implement kvm_arch_has_default_irqchip() for KVM selftests - Factored-out ISA checks into separate sources - Added hideleg to struct kvm_vcpu_config - Factored-out VCPU config into separate sources - Support configuration of per-VM HGATP mode from KVM user space s390: - Support for ESA (31-bit) guests inside nested hypervisors - Remove restriction on memslot alignment, which is not needed anymore with the new gmap code - Fix LPSW/E to update the bear (which of course is the breaking event address register) x86: - Shut up various UBSAN warnings on reading module parameter before they were initialized - Don't zero-allocate page tables that are used for splitting hugepages in the TDP MMU, as KVM is guaranteed to set all SPTEs in the page table and thus write all bytes - As an optimization, bail early when trying to unsync 4KiB mappings if the target gfn can just be mapped with a 2MiB hugepage x86 generic: - Copy single-chunk MMIO write values into struct kvm_vcpu (more precisely struct kvm_mmio_fragment) to fix use-after-free stack bugs where KVM would dereference stack pointer after an exit to userspace - Clean up and comment the emulated MMIO code to try to make it easier to maintain (not necessarily "easy", but "easier") - Move VMXON+VMXOFF and EFER.SVME toggling out of KVM (not *all* of VMX and SVM enabling) as it is needed for trusted I/O - Advertise support for AVX512 Bit Matrix Multiply (BMM) instructions - Immediately fail the build if a required #define is missing in one of KVM's headers that is included multiple times - Reject SET_GUEST_DEBUG with -EBUSY if there's an already injected exception, mostly to prevent syzkaller from abusing the uAPI to trigger WARNs, but also because it can help prevent userspace from unintentionally crashing the VM - Exempt SMM from CPUID faulting on Intel, as per the spec - Misc hardening and cleanup changes x86 (AMD): - Fix and optimize IRQ window inhibit handling for AVIC; make it per-vCPU so that KVM doesn't prematurely re-enable AVIC if multiple vCPUs have to-be-injected IRQs - Clean up and optimize the OSVW handling, avoiding a bug in which KVM would overwrite state when enabling virtualization on multiple CPUs in parallel. This should not be a problem because OSVW should usually be the same for all CPUs - Drop a WARN in KVM_MEMORY_ENCRYPT_REG_REGION where KVM complains about a "too large" size based purely on user input - Clean up and harden the pinning code for KVM_MEMORY_ENCRYPT_REG_REGION - Disallow synchronizing a VMSA of an already-launched/encrypted vCPU, as doing so for an SNP guest will crash the host due to an RMP violation page fault - Overhaul KVM's APIs for detecting SEV+ guests so that VM-scoped queries are required to hold kvm->lock, and enforce it by lockdep. Fix various bugs where sev_guest() was not ensured to be stable for the whole duration of a function or ioctl - Convert a pile of kvm->lock SEV code to guard() - Play nicer with userspace that does not enable KVM_CAP_EXCEPTION_PAYLOAD, for which KVM needs to set CR2 and DR6 as a response to ioctls such as KVM_GET_VCPU_EVENTS (even if the payload would end up in EXITINFO2 rather than CR2, for example). Only set CR2 and DR6 when consumption of the payload is imminent, but on the other hand force delivery of the payload in all paths where userspace retrieves CR2 or DR6 - Use vcpu->arch.cr2 when updating vmcb12's CR2 on nested #VMEXIT instead of vmcb02->save.cr2. The value is out of sync after a save/restore or after a #PF is injected into L2 - Fix a class of nSVM bugs where some fields written by the CPU are not synchronized from vmcb02 to cached vmcb12 after VMRUN, and so are not up-to-date when saved by KVM_GET_NESTED_STATE - Fix a class of bugs where the ordering between KVM_SET_NESTED_STATE and KVM_SET_{S}REGS could cause vmcb02 to be incorrectly initialized after save+restore - Add a variety of missing nSVM consistency checks - Fix several bugs where KVM failed to correctly update VMCB fields on nested #VMEXIT - Fix several bugs where KVM failed to correctly synthesize #UD or #GP for SVM-related instructions - Add support for save+restore of virtualized LBRs (on SVM) - Refactor various helpers and macros to improve clarity and (hopefully) make the code easier to maintain - Aggressively sanitize fields when copying from vmcb12, to guard against unintentionally allowing L1 to utilize yet-to-be-defined features - Fix several bugs where KVM botched rAX legality checks when emulating SVM instructions. There are remaining issues in that KVM doesn't handle size prefix overrides for 64-bit guests - Fail emulation of VMRUN/VMLOAD/VMSAVE if mapping vmcb12 fails instead of somewhat arbitrarily synthesizing #GP (i.e. don't double down on AMD's architectural but sketchy behavior of generating #GP for "unsupported" addresses) - Cache all used vmcb12 fields to further harden against TOCTOU bugs x86 (Intel): - Drop obsolete branch hint prefixes from the VMX instruction macros - Use ASM_INPUT_RM() in __vmcs_writel() to coerce clang into using a register input when appropriate - Code cleanups guest_memfd: - Don't mark guest_memfd folios as accessed, as guest_memfd doesn't support reclaim, the memory is unevictable, and there is no storage to write back to LoongArch selftests: - Add KVM PMU test cases s390 selftests: - Enable more memory selftests x86 selftests: - Add support for Hygon CPUs in KVM selftests - Fix a bug in the MSR test where it would get false failures on AMD/Hygon CPUs with exactly one of RDPID or RDTSCP - Add an MADV_COLLAPSE testcase for guest_memfd as a regression test for a bug where the kernel would attempt to collapse guest_memfd folios against KVM's will" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (373 commits) KVM: x86: use inlines instead of macros for is_sev_*guest x86/virt: Treat SVM as unsupported when running as an SEV+ guest KVM: SEV: Goto an existing error label if charging misc_cg for an ASID fails KVM: SVM: Move lock-protected allocation of SEV ASID into a separate helper KVM: SEV: use mutex guard in snp_handle_guest_req() KVM: SEV: use mutex guard in sev_mem_enc_unregister_region() KVM: SEV: use mutex guard in sev_mem_enc_ioctl() KVM: SEV: use mutex guard in snp_launch_update() KVM: SEV: Assert that kvm->lock is held when querying SEV+ support KVM: SEV: Document that checking for SEV+ guests when reclaiming memory is "safe" KVM: SEV: Hide "struct kvm_sev_info" behind CONFIG_KVM_AMD_SEV=y KVM: SEV: WARN on unhandled VM type when initializing VM KVM: LoongArch: selftests: Add PMU overflow interrupt test KVM: LoongArch: selftests: Add basic PMU event counting test KVM: LoongArch: selftests: Add cpucfg read/write helpers LoongArch: KVM: Add DMSINTC inject msi to vCPU LoongArch: KVM: Add DMSINTC device support LoongArch: KVM: Make vcpu_is_preempted() as a macro rather than function LoongArch: KVM: Move host CSR_GSTAT save and restore in context switch LoongArch: KVM: Move host CSR_EENTRY save and restore in context switch ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/irqchip/arm-gic-v5.h27
-rw-r--r--include/linux/kvm_host.h27
2 files changed, 44 insertions, 10 deletions
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index b78488df6c98..40d2fce68294 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -25,6 +25,28 @@
#define GICV5_HWIRQ_TYPE_SPI UL(0x3)
/*
+ * Architected PPIs
+ */
+#define GICV5_ARCH_PPI_S_DB_PPI 0x0
+#define GICV5_ARCH_PPI_RL_DB_PPI 0x1
+#define GICV5_ARCH_PPI_NS_DB_PPI 0x2
+#define GICV5_ARCH_PPI_SW_PPI 0x3
+#define GICV5_ARCH_PPI_HACDBSIRQ 0xf
+#define GICV5_ARCH_PPI_CNTHVS 0x13
+#define GICV5_ARCH_PPI_CNTHPS 0x14
+#define GICV5_ARCH_PPI_PMBIRQ 0x15
+#define GICV5_ARCH_PPI_COMMIRQ 0x16
+#define GICV5_ARCH_PPI_PMUIRQ 0x17
+#define GICV5_ARCH_PPI_CTIIRQ 0x18
+#define GICV5_ARCH_PPI_GICMNT 0x19
+#define GICV5_ARCH_PPI_CNTHP 0x1a
+#define GICV5_ARCH_PPI_CNTV 0x1b
+#define GICV5_ARCH_PPI_CNTHV 0x1c
+#define GICV5_ARCH_PPI_CNTPS 0x1d
+#define GICV5_ARCH_PPI_CNTP 0x1e
+#define GICV5_ARCH_PPI_TRBIRQ 0x1f
+
+/*
* Tables attributes
*/
#define GICV5_NO_READ_ALLOC 0b0
@@ -365,6 +387,11 @@ int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
int gicv5_irs_iste_alloc(u32 lpi);
void gicv5_irs_syncr(void);
+/* Embedded in kvm.arch */
+struct gicv5_vpe {
+ bool resident;
+};
+
struct gicv5_its_devtab_cfg {
union {
struct {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6b76e7a6f4c2..4c14aee1fb06 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -318,7 +318,8 @@ static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop)
struct kvm_mmio_fragment {
gpa_t gpa;
void *data;
- unsigned len;
+ u64 val;
+ unsigned int len;
};
struct kvm_vcpu {
@@ -1029,6 +1030,13 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
return NULL;
}
+static inline bool kvm_is_vcpu_creation_in_progress(struct kvm *kvm)
+{
+ lockdep_assert_held(&kvm->lock);
+
+ return kvm->created_vcpus != atomic_read(&kvm->online_vcpus);
+}
+
void kvm_destroy_vcpus(struct kvm *kvm);
int kvm_trylock_all_vcpus(struct kvm *kvm);
@@ -1628,6 +1636,13 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
/*
+ * kvm_arch_shutdown() is invoked immediately prior to forcefully disabling
+ * hardware virtualization on all CPUs via IPI function calls (in preparation
+ * for shutdown or reboot), e.g. to allow arch code to prepare for disabling
+ * virtualization while KVM may be actively running vCPUs.
+ */
+void kvm_arch_shutdown(void);
+/*
* kvm_arch_{enable,disable}_virtualization() are called on one CPU, under
* kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of
* kvm_usage_count, i.e. at the beginning of the generic hardware enabling
@@ -2300,7 +2315,6 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
extern bool enable_virt_at_load;
-extern bool kvm_rebooting;
#endif
extern unsigned int halt_poll_ns;
@@ -2366,6 +2380,7 @@ void kvm_unregister_device_ops(u32 type);
extern struct kvm_device_ops kvm_mpic_ops;
extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
+extern struct kvm_device_ops kvm_arm_vgic_v5_ops;
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
@@ -2594,12 +2609,4 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
struct kvm_pre_fault_memory *range);
#endif
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-int kvm_enable_virtualization(void);
-void kvm_disable_virtualization(void);
-#else
-static inline int kvm_enable_virtualization(void) { return 0; }
-static inline void kvm_disable_virtualization(void) { }
-#endif
-
#endif