// SPDX-License-Identifier: GPL-2.0-only
#include <linux/cc_platform.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

#include <asm/cpufeature.h>
#include <asm/msr.h>
#include <asm/perf_event.h>
#include <asm/reboot.h>
#include <asm/tlbflush.h>
#include <asm/vmx.h>

struct x86_virt_ops {
        int feature;

        int (*enable_virtualization_cpu)(void);
        int (*disable_virtualization_cpu)(void);
        void (*emergency_disable_virtualization_cpu)(void);
};

static struct x86_virt_ops virt_ops __ro_after_init;

__visible bool virt_rebooting;
EXPORT_SYMBOL_FOR_KVM(virt_rebooting);

static DEFINE_PER_CPU(int, virtualization_nr_users);

static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;

void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback)
{
        if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback)))
                return;

        rcu_assign_pointer(kvm_emergency_callback, callback);
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_register_emergency_callback);

void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback)
{
        if (WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback) != callback))
                return;

        rcu_assign_pointer(kvm_emergency_callback, NULL);
        synchronize_rcu();
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_unregister_emergency_callback);

static void x86_virt_invoke_kvm_emergency_callback(void)
{
        cpu_emergency_virt_cb *kvm_callback;

        kvm_callback = rcu_dereference(kvm_emergency_callback);
        if (kvm_callback)
                kvm_callback();
}
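/*
 * A minimal sketch of how a hypervisor module (e.g. KVM) is expected to pair
 * the registration APIs above.  The function names below are illustrative,
 * not taken from KVM itself.  The key constraints: the callback must be
 * NMI-safe, only one callback can be registered at a time, and the module
 * must unregister (which includes the synchronize_rcu() above) before its
 * text can go away, i.e. before module unload:
 *
 *      static void my_emergency_disable(void)
 *      {
 *              // NMI context: tear down virtualization without locks.
 *      }
 *
 *      static int __init my_hv_init(void)
 *      {
 *              x86_virt_register_emergency_callback(my_emergency_disable);
 *              return 0;
 *      }
 *
 *      static void __exit my_hv_exit(void)
 *      {
 *              x86_virt_unregister_emergency_callback(my_emergency_disable);
 *      }
 */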
#if IS_ENABLED(CONFIG_KVM_INTEL)
static DEFINE_PER_CPU(struct vmcs *, root_vmcs);

static int x86_virt_cpu_vmxon(void)
{
        u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id()));
        u64 msr;

        cr4_set_bits(X86_CR4_VMXE);

        asm goto("1: vmxon %[vmxon_pointer]\n\t"
                 _ASM_EXTABLE(1b, %l[fault])
                 : : [vmxon_pointer] "m"(vmxon_pointer)
                 : : fault);
        return 0;

fault:
        WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
                  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
        cr4_clear_bits(X86_CR4_VMXE);
        return -EFAULT;
}

static int x86_vmx_enable_virtualization_cpu(void)
{
        int r;

        if (cr4_read_shadow() & X86_CR4_VMXE)
                return -EBUSY;

        intel_pt_handle_vmx(1);

        r = x86_virt_cpu_vmxon();
        if (r) {
                intel_pt_handle_vmx(0);
                return r;
        }

        return 0;
}

/*
 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults).
 *
 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 * atomically track post-VMXON state, e.g. this may be called in NMI context.
 * Eat all faults, as faults on VMXOFF other than the !post-VMXON #UD are mode
 * related, i.e. a fault is guaranteed to be due to the !post-VMXON check
 * unless the CPU is magically in RM, VM86, compat mode, or at CPL>0.
 */
static int x86_vmx_disable_virtualization_cpu(void)
{
        int r = -EIO;

        asm goto("1: vmxoff\n\t"
                 _ASM_EXTABLE(1b, %l[fault])
                 ::: "cc", "memory" : fault);
        r = 0;

fault:
        cr4_clear_bits(X86_CR4_VMXE);
        intel_pt_handle_vmx(0);
        return r;
}

static void x86_vmx_emergency_disable_virtualization_cpu(void)
{
        virt_rebooting = true;

        /*
         * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
         * set in task context.  If this races with _another_ emergency call
         * from NMI context, VMCLEAR (in KVM) and VMXOFF may #UD, but KVM and
         * the kernel will eat those faults due to virt_rebooting being set by
         * the interrupting NMI callback.
         */
        if (!(__read_cr4() & X86_CR4_VMXE))
                return;

        x86_virt_invoke_kvm_emergency_callback();

        x86_vmx_disable_virtualization_cpu();
}

static __init void x86_vmx_exit(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                free_page((unsigned long)per_cpu(root_vmcs, cpu));
                per_cpu(root_vmcs, cpu) = NULL;
        }
}

static __init int __x86_vmx_init(void)
{
        const struct x86_virt_ops vmx_ops = {
                .feature = X86_FEATURE_VMX,
                .enable_virtualization_cpu = x86_vmx_enable_virtualization_cpu,
                .disable_virtualization_cpu = x86_vmx_disable_virtualization_cpu,
                .emergency_disable_virtualization_cpu = x86_vmx_emergency_disable_virtualization_cpu,
        };
        u64 basic_msr;
        u32 rev_id;
        int cpu;

        if (!cpu_feature_enabled(X86_FEATURE_VMX))
                return -EOPNOTSUPP;

        rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);

        /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
        if (WARN_ON_ONCE(vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE))
                return -EIO;

        /*
         * Even if eVMCS is enabled (or may be enabled later), and even though
         * it's not explicitly documented by the TLFS, the root VMCS passed to
         * VMXON must still be marked with the revision_id reported by the
         * physical CPU.
         */
        rev_id = vmx_basic_vmcs_revision_id(basic_msr);

        for_each_possible_cpu(cpu) {
                int node = cpu_to_node(cpu);
                struct page *page;
                struct vmcs *vmcs;

                page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
                if (WARN_ON_ONCE(!page)) {
                        x86_vmx_exit();
                        return -ENOMEM;
                }

                vmcs = page_address(page);
                vmcs->hdr.revision_id = rev_id;
                per_cpu(root_vmcs, cpu) = vmcs;
        }

        memcpy(&virt_ops, &vmx_ops, sizeof(virt_ops));
        return 0;
}

static __init int x86_vmx_init(void)
{
        int r;

        r = __x86_vmx_init();
        if (r)
                setup_clear_cpu_cap(X86_FEATURE_VMX);

        return r;
}
#else
static __init int x86_vmx_init(void) { return -EOPNOTSUPP; }
static __init void x86_vmx_exit(void) { }
#endif
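/*
 * For reference, a sketch of the MSR_IA32_VMX_BASIC fields consumed above,
 * per the SDM (Vol. 3, Appendix A.1).  These are illustrative equivalents of
 * the real vmx_basic_vmcs_revision_id() and vmx_basic_vmcs_size() helpers,
 * not their actual implementations:
 *
 *      // Bits 30:0 - VMCS revision identifier.
 *      static inline u32 example_vmcs_revision_id(u64 basic)
 *      {
 *              return basic & GENMASK_ULL(30, 0);
 *      }
 *
 *      // Bits 44:32 - size of the VMXON/VMCS region, at most 4096 bytes.
 *      static inline u32 example_vmcs_size(u64 basic)
 *      {
 *              return (basic & GENMASK_ULL(44, 32)) >> 32;
 *      }
 */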
#if IS_ENABLED(CONFIG_KVM_AMD)
static int x86_svm_enable_virtualization_cpu(void)
{
        u64 efer;

        rdmsrq(MSR_EFER, efer);
        if (efer & EFER_SVME)
                return -EBUSY;

        wrmsrq(MSR_EFER, efer | EFER_SVME);
        return 0;
}

static int x86_svm_disable_virtualization_cpu(void)
{
        int r = -EIO;
        u64 efer;

        /*
         * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and NMI
         * aren't blocked.
         */
        asm goto("1: stgi\n\t"
                 _ASM_EXTABLE(1b, %l[fault])
                 ::: "memory" : fault);
        r = 0;

fault:
        rdmsrq(MSR_EFER, efer);
        wrmsrq(MSR_EFER, efer & ~EFER_SVME);
        return r;
}

static void x86_svm_emergency_disable_virtualization_cpu(void)
{
        u64 efer;

        virt_rebooting = true;

        rdmsrq(MSR_EFER, efer);
        if (!(efer & EFER_SVME))
                return;

        x86_virt_invoke_kvm_emergency_callback();

        x86_svm_disable_virtualization_cpu();
}

static __init int x86_svm_init(void)
{
        const struct x86_virt_ops svm_ops = {
                .feature = X86_FEATURE_SVM,
                .enable_virtualization_cpu = x86_svm_enable_virtualization_cpu,
                .disable_virtualization_cpu = x86_svm_disable_virtualization_cpu,
                .emergency_disable_virtualization_cpu = x86_svm_emergency_disable_virtualization_cpu,
        };

        if (!cpu_feature_enabled(X86_FEATURE_SVM) ||
            cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
                return -EOPNOTSUPP;

        memcpy(&virt_ops, &svm_ops, sizeof(virt_ops));
        return 0;
}
#else
static __init int x86_svm_init(void) { return -EOPNOTSUPP; }
#endif

int x86_virt_get_ref(int feat)
{
        int r;

        /* Ensure the !feature check can't get false positives. */
        BUILD_BUG_ON(!X86_FEATURE_SVM || !X86_FEATURE_VMX);

        if (!virt_ops.feature || virt_ops.feature != feat)
                return -EOPNOTSUPP;

        guard(preempt)();

        if (this_cpu_inc_return(virtualization_nr_users) > 1)
                return 0;

        r = virt_ops.enable_virtualization_cpu();
        if (r)
                WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users));

        return r;
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_get_ref);

void x86_virt_put_ref(int feat)
{
        guard(preempt)();

        if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)) ||
            this_cpu_dec_return(virtualization_nr_users))
                return;

        BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting);
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_put_ref);

/*
 * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
 * reboot.  VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
 * GIF=0, i.e. if the crash occurred between CLGI and STGI.
 */
int x86_virt_emergency_disable_virtualization_cpu(void)
{
        if (!virt_ops.feature)
                return -EOPNOTSUPP;

        /*
         * IRQs must be disabled as virtualization is enabled in hardware via
         * function call IPIs, i.e. IRQs need to be disabled to guarantee
         * virtualization stays disabled.
         */
        lockdep_assert_irqs_disabled();

        /*
         * Do the NMI shootdown even if virtualization is off on _this_ CPU,
         * as other CPUs may have virtualization enabled.
         *
         * TODO: Track whether or not virtualization might be enabled on other
         * CPUs?  May not be worth avoiding the NMI shootdown...
         */
        virt_ops.emergency_disable_virtualization_cpu();
        return 0;
}

void __init x86_virt_init(void)
{
        /*
         * Attempt to initialize both SVM and VMX, and simply use whichever
         * one is present.  Refuse to enable/use SVM or VMX if both are
         * somehow supported, as no known CPU supports both SVM and VMX.
         */
        bool has_vmx = !x86_vmx_init();
        bool has_svm = !x86_svm_init();

        if (WARN_ON_ONCE(has_vmx && has_svm)) {
                x86_vmx_exit();
                memset(&virt_ops, 0, sizeof(virt_ops));
        }
}
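/*
 * A minimal sketch of the intended calling convention for the refcounting
 * API above, from the perspective of a hypothetical hypervisor module.  The
 * hook names are illustrative.  The key points: the reference count is
 * per-CPU, so each CPU that will execute virtualization instructions must
 * take its own reference while running on that CPU (gets/puts run under a
 * preemption guard), and gets and puts must be balanced per CPU:
 *
 *      static int my_hv_cpu_online(unsigned int cpu)
 *      {
 *              // Runs on 'cpu'; enables VMX on the first reference.
 *              return x86_virt_get_ref(X86_FEATURE_VMX);
 *      }
 *
 *      static int my_hv_cpu_offline(unsigned int cpu)
 *      {
 *              // Runs on 'cpu'; disables VMX when the last reference is put.
 *              x86_virt_put_ref(X86_FEATURE_VMX);
 *              return 0;
 *      }
 */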