summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2026-05-04 04:44:52 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2026-05-10 15:01:06 +0200
commit2be108307eae241359bb32ee259ba0b5378156aa (patch)
treeaa0f474611cb650f7a3ecf5713cda62f392279f1
parent6d35786de28116ecf78797a62b84e6bf3c45aa5a (diff)
parent687ee95c1b6d7509a25bbc661e62801aa06cc2ae (diff)
Merge branch 'kvm-mbec' into HEAD
This topic branch introduces support for two related features that Hyper-V uses in its implementation of Virtual Secure Mode; these are Intel Mode-Based Execute Control and AMD Guest Mode Execution Trap. Both MBEC and GMET allow more granular control over execute permissions, with different levels of separation between supervisor and user mode. MBEC provides support for separate supervisor and user-mode bits in the PTEs; GMET instead lacks supervisor-mode only execution (with NX=0, "both" is represented by U=0 and user-mode only by U=1). GMET was clearly inspired by SMEP, though with some differences and annoyances. The implementation starts from two changes to core MMU code, both of which help make the actual feature almost trivial to implement: - first, I'm cleaning up the implementation of nVMX exec-only, by properly adding read permissions to the ACC_* constants and to the permission bitmask machinery. Jon also had to add a fourth ACC_* bit, but used it only in the special case of nested MBEC; here instead ACC_READ_MASK is the norm, which simplifies testing a lot and removes gratuitous complexity. - second, I'm enforcing that KVM runs with MBEC/GMET enabled even in non-nested mode, if it wants to provide the feature to nested hypervisors. This makes the creation of SPTEs look exactly the same for L1 and L2 guests, despite only the latter using MBEC/GMET fully; the difference lies only in the input access permissions. This strategy adds only a limited amount of complexity to the core, while providing almost entirely seamless support for nested hypervisors. Later patches have to use slightly different meanings for ACC_* on Intel and AMD. On the Intel side, some work is needed in order to split shadow_x_mask and ACC_EXEC_MASK in two; now that there is an actual ACC_READ_MASK to be used for exec-only pages, ACC_USER_MASK is unused and can be reused as ACC_USER_EXEC_MASK. 
However, unlike the older ACC_USER_MASK hack, these differences are backed by concrete concepts of the page table format, and there is always a 1:1 mapping from ACC_* bits to PT_*_MASK or shadow_*_mask:

                          Intel                 AMD
    --------------------  -------------------   -------------------
    ACC_READ_MASK         PT_PRESENT_MASK       PT_PRESENT_MASK
    ACC_WRITE_MASK        PT_WRITABLE_MASK      PT_WRITABLE_MASK
    ACC_EXEC_MASK         shadow_xs_mask        shadow_nx_mask
    ACC_USER_MASK         ---                   shadow_user_mask
    ACC_USER_EXEC_MASK    shadow_xu_mask        ---

On Intel, ACC_EXEC_MASK is used for kernel-mode execution and is tied to shadow_xs_mask (when MBEC is disabled, ACC_USER_EXEC_MASK and the XU bit are computed but ineffective). update_permission_bitmask() precomputes all the necessary conditions. On the AMD side, the U bit maps to ACC_USER_MASK but nNPT adjusts the permission bitmask to ignore it for reads and writes when GMET is active. Despite the smaller scale of the changes compared to MBEC, some changes are needed in order to use GMET for L1 guests, because the page tables have to be created with U=0. This means that the root page has role.access != ACC_ALL and its permissions have to be propagated down. Note that with MBEC the user/supervisor distinction depends on the U bit of the page tables rather than the CPL. Processors provide this information to the hypervisor through the "advanced EPT violation vmexit info" feature, which is a requirement for KVM to use MBEC, and kvm-intel.ko passes it to the MMU in PFERR_USER_MASK (unlike kvm-amd.ko which computes it from the CPL). This needs a small change to pass the effective XWU permissions of the page tables down to translate_nested_gpa(). The former "smep_andnot_wp" bit of cpu_role.base, now named "cr4_smep", is repurposed for nested TDP to indicate that MBEC/GMET is on. 
The minor pessimization for shadow page tables (toggling CR4.SMEP now always forces building a separate version of the shadow page tables, even though that's technically unnecessary if CR4.WP=1) is not really worth fretting about; in practice, guests are not going to flip CR4.SMEP in a way that would prevent efficient reuse of shadow page tables. Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-rw-r--r--Documentation/virt/kvm/x86/mmu.rst10
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h48
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/vmx.h14
-rw-r--r--arch/x86/kvm/hyperv.c4
-rw-r--r--arch/x86/kvm/mmu.h30
-rw-r--r--arch/x86/kvm/mmu/mmu.c184
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h19
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h73
-rw-r--r--arch/x86/kvm/mmu/spte.c94
-rw-r--r--arch/x86/kvm/mmu/spte.h70
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c6
-rw-r--r--arch/x86/kvm/svm/nested.c38
-rw-r--r--arch/x86/kvm/svm/svm.c31
-rw-r--r--arch/x86/kvm/svm/svm.h1
-rw-r--r--arch/x86/kvm/vmx/capabilities.h12
-rw-r--r--arch/x86/kvm/vmx/common.h26
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h1
-rw-r--r--arch/x86/kvm/vmx/main.c9
-rw-r--r--arch/x86/kvm/vmx/nested.c46
-rw-r--r--arch/x86/kvm/vmx/tdx.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c22
-rw-r--r--arch/x86/kvm/vmx/vmx.h1
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h6
-rw-r--r--arch/x86/kvm/x86.c18
27 files changed, 538 insertions, 230 deletions
diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst
index 2b3b6d442302..666aa179601a 100644
--- a/Documentation/virt/kvm/x86/mmu.rst
+++ b/Documentation/virt/kvm/x86/mmu.rst
@@ -184,10 +184,8 @@ Shadow pages contain the following information:
Contains the value of efer.nx for which the page is valid.
role.cr0_wp:
Contains the value of cr0.wp for which the page is valid.
- role.smep_andnot_wp:
- Contains the value of cr4.smep && !cr0.wp for which the page is valid
- (pages for which this is true are different from other pages; see the
- treatment of cr0.wp=0 below).
+ role.cr4_smep:
+ Contains the value of cr4.smep for which the page is valid.
role.smap_andnot_wp:
Contains the value of cr4.smap && !cr0.wp for which the page is valid
(pages for which this is true are different from other pages; see the
@@ -435,8 +433,8 @@ from being written by the kernel after cr0.wp has changed to 1, we make
the value of cr0.wp part of the page role. This means that an spte created
with one value of cr0.wp cannot be used when cr0.wp has a different value -
it will simply be missed by the shadow page lookup code. A similar issue
-exists when an spte created with cr0.wp=0 and cr4.smep=0 is used after
-changing cr4.smep to 1. To avoid this, the value of !cr0.wp && cr4.smep
+exists when an spte created with cr0.wp=0 and cr4.smap=0 is used after
+changing cr4.smap to 1. To avoid this, the value of !cr0.wp && cr4.smap
is also made a part of the page role.
Large pages
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 1d506e5d6f46..35a2a0f9ab32 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -379,6 +379,7 @@
#define X86_FEATURE_AVIC (15*32+13) /* "avic" Virtual Interrupt Controller */
#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* "v_vmsave_vmload" Virtual VMSAVE VMLOAD */
#define X86_FEATURE_VGIF (15*32+16) /* "vgif" Virtual GIF */
+#define X86_FEATURE_GMET (15*32+17) /* Guest Mode Execution Trap */
#define X86_FEATURE_X2AVIC (15*32+18) /* "x2avic" Virtual x2apic */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3776cf5382a2..e4fca997ec79 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -94,6 +94,7 @@ KVM_X86_OP_OPTIONAL(sync_pir_to_irr)
KVM_X86_OP_OPTIONAL_RET0(set_tss_addr)
KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
+KVM_X86_OP_OPTIONAL_RET0(tdp_has_smep)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP_OPTIONAL(link_external_spt)
KVM_X86_OP_OPTIONAL(set_external_spte)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c470e40a00aa..1da3d5c59e15 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -328,11 +328,11 @@ struct kvm_kernel_irq_routing_entry;
* the number of unique SPs that can theoretically be created is 2^n, where n
* is the number of bits that are used to compute the role.
*
- * But, even though there are 20 bits in the mask below, not all combinations
+ * But, even though there are 21 bits in the mask below, not all combinations
* of modes and flags are possible:
*
* - invalid shadow pages are not accounted, mirror pages are not shadowed,
- * so the bits are effectively 18.
+ * so the bits are effectively 19.
*
* - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
* execonly and ad_disabled are only used for nested EPT which has
@@ -343,11 +343,11 @@ struct kvm_kernel_irq_routing_entry;
* paging has exactly one upper level, making level completely redundant
* when has_4_byte_gpte=1.
*
- * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
- * cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
+ * - on top of this, smap_andnot_wp is only set if cr0_wp=0,
+ * therefore these two bits only give rise to 3 possibilities.
*
* Therefore, the maximum number of possible upper-level shadow pages for a
- * single gfn is a bit less than 2^13.
+ * single gfn is a bit less than 2^14.
*/
union kvm_mmu_page_role {
u32 word;
@@ -356,17 +356,26 @@ union kvm_mmu_page_role {
unsigned has_4_byte_gpte:1;
unsigned quadrant:2;
unsigned direct:1;
- unsigned access:3;
+ unsigned access:4;
unsigned invalid:1;
unsigned efer_nx:1;
unsigned cr0_wp:1;
- unsigned smep_andnot_wp:1;
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
unsigned guest_mode:1;
unsigned passthrough:1;
unsigned is_mirror:1;
- unsigned :4;
+
+ /*
+ * cr4_smep is also set for EPT MBEC. Because it affects
+ * which pages are considered non-present (bit 10 additionally
+ * must be zero if MBEC is on) it has to be in the base role.
+ * It also has to be in the base role for AMD GMET because
+ * kernel-executable pages need to have U=0 with GMET enabled.
+ */
+ unsigned cr4_smep:1;
+
+ unsigned:3;
/*
* This is left at the top of the word so that
@@ -392,10 +401,10 @@ union kvm_mmu_page_role {
* tables (because KVM doesn't support Protection Keys with shadow paging), and
* CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
*
- * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
- * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
- * SMAP, but the MMU's permission checks for software walks need to be SMEP and
- * SMAP aware regardless of CR0.WP.
+ * Note, SMAP is not redundant with smap_andnot_wp in the page role. If
+ * CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMAP,
+ * but the MMU's permission checks for software walks need to be SMAP
+ * aware regardless of CR0.WP.
*/
union kvm_mmu_extended_role {
u32 word;
@@ -405,9 +414,15 @@ union kvm_mmu_extended_role {
unsigned int cr4_pse:1;
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
- unsigned int cr4_smep:1;
unsigned int cr4_la57:1;
unsigned int efer_lma:1;
+
+ /*
+ * True if either CR4.SMEP or EFER.NXE are set. For AMD NPT
+ * this is the "real" host CR4.SMEP whereas cr4_smep is
+ * actually GMET.
+ */
+ unsigned int has_pferr_fetch:1;
};
};
@@ -492,7 +507,7 @@ struct kvm_mmu {
* Byte index: page fault error code [4:1]
* Bit index: pte permissions in ACC_* format
*/
- u8 permissions[16];
+ u16 permissions[16];
u64 *pae_root;
u64 *pml4_root;
@@ -1887,6 +1902,7 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+ bool (*tdp_has_smep)(struct kvm *kvm);
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);
@@ -2010,6 +2026,10 @@ struct kvm_x86_nested_ops {
struct kvm_nested_state *kvm_state);
bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
+ gpa_t (*translate_nested_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 access,
+ struct x86_exception *exception,
+ u64 pte_access);
int (*enable_evmcs)(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index bcfeb5e7c0ed..aa63431ba92c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -243,6 +243,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_MISC_ENABLE_NP BIT(0)
#define SVM_MISC_ENABLE_SEV BIT(1)
#define SVM_MISC_ENABLE_SEV_ES BIT(2)
+#define SVM_MISC_ENABLE_GMET BIT(3)
#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0)
#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1)
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 37080382df54..ed2ded531e55 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -535,6 +535,7 @@ enum vmcs_field {
#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
#define VMX_EPT_INVEPT_BIT (1ull << 20)
#define VMX_EPT_AD_BIT (1ull << 21)
+#define VMX_EPT_ADVANCED_VMEXIT_INFO_BIT (1ull << 22)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -560,10 +561,12 @@ enum vmcs_field {
#define VMX_EPT_ACCESS_BIT (1ull << 8)
#define VMX_EPT_DIRTY_BIT (1ull << 9)
#define VMX_EPT_SUPPRESS_VE_BIT (1ull << 63)
+
#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
VMX_EPT_WRITABLE_MASK | \
VMX_EPT_EXECUTABLE_MASK)
#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
+#define VMX_EPT_USER_EXECUTABLE_MASK (1ull << 10)
static inline u8 vmx_eptp_page_walk_level(u64 eptp)
{
@@ -608,17 +611,24 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_PROT_READ BIT(3)
#define EPT_VIOLATION_PROT_WRITE BIT(4)
#define EPT_VIOLATION_PROT_EXEC BIT(5)
-#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
+#define EPT_VIOLATION_PROT_USER_EXEC BIT(6)
#define EPT_VIOLATION_PROT_MASK (EPT_VIOLATION_PROT_READ | \
EPT_VIOLATION_PROT_WRITE | \
- EPT_VIOLATION_PROT_EXEC)
+ EPT_VIOLATION_PROT_EXEC | \
+ EPT_VIOLATION_PROT_USER_EXEC)
#define EPT_VIOLATION_GVA_IS_VALID BIT(7)
#define EPT_VIOLATION_GVA_TRANSLATED BIT(8)
+#define EPT_VIOLATION_GVA_USER BIT(9)
+#define EPT_VIOLATION_GVA_WRITABLE BIT(10)
+#define EPT_VIOLATION_GVA_NX BIT(11)
#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
+#define EPT_VIOLATION_USER_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_USER_EXECUTABLE_MASK) >> 4)
static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
(EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
+static_assert(EPT_VIOLATION_USER_EXEC_TO_PROT(VMX_EPT_USER_EXECUTABLE_MASK) ==
+ (EPT_VIOLATION_PROT_USER_EXEC));
/*
* Exit Qualifications for NOTIFY VM EXIT
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 4438ecac9a89..015c6947b462 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2041,7 +2041,9 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
* read with kvm_read_guest().
*/
if (!hc->fast && mmu_is_nested(vcpu)) {
- hc->ingpa = translate_nested_gpa(vcpu, hc->ingpa, 0, NULL);
+ hc->ingpa = kvm_x86_ops.nested_ops->translate_nested_gpa(
+ vcpu, hc->ingpa,
+ PFERR_GUEST_FINAL_MASK, NULL, 0);
if (unlikely(hc->ingpa == INVALID_GPA))
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 830f46145692..ddf4e467c071 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -37,6 +37,13 @@ extern bool __read_mostly enable_mmio_caching;
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
+#define ACC_READ_MASK PT_PRESENT_MASK
+#define ACC_WRITE_MASK PT_WRITABLE_MASK
+#define ACC_USER_MASK PT_USER_MASK /* non EPT */
+#define ACC_USER_EXEC_MASK ACC_USER_MASK /* EPT only */
+#define ACC_EXEC_MASK 8
+#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK | ACC_READ_MASK)
+
#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
@@ -76,19 +83,24 @@ static inline gfn_t kvm_mmu_max_gfn(void)
return (1ULL << (max_gpa_bits - PAGE_SHIFT)) - 1;
}
+static inline bool mmu_has_mbec(struct kvm_mmu *mmu)
+{
+ return mmu->root_role.cr4_smep;
+}
+
u8 kvm_mmu_get_max_tdp_level(void);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+void kvm_mmu_set_ept_masks(bool has_ad_bits);
void kvm_init_mmu(struct kvm_vcpu *vcpu);
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
- unsigned long cr4, u64 efer, gpa_t nested_cr3);
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
+ u64 efer, gpa_t nested_cr3, u64 misc_ctl);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
int huge_page_level, bool accessed_dirty,
- gpa_t new_eptp);
+ bool mbec, gpa_t new_eptp);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
@@ -288,17 +300,17 @@ static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
atomic64_add(count, &kvm->stat.pages[level - 1]);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
- struct x86_exception *exception);
-
static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
gpa_t gpa, u64 access,
- struct x86_exception *exception)
+ struct x86_exception *exception,
+ u64 pte_access)
{
if (mmu != &vcpu->arch.nested_mmu)
return gpa;
- return translate_nested_gpa(vcpu, gpa, access, exception);
+ return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access,
+ exception,
+ pte_access);
}
static inline bool kvm_has_mirrored_tdp(const struct kvm *kvm)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 892246204435..f8aa7eda661e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -55,6 +55,7 @@
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/spec-ctrl.h>
+#include <asm/svm.h>
#include <asm/vmx.h>
#include "trace.h"
@@ -229,13 +230,18 @@ static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
}
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
-BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
+BUILD_MMU_ROLE_ACCESSOR(base, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+static inline bool has_pferr_fetch(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.ext.has_pferr_fetch;
+}
+
static inline bool is_cr0_pg(struct kvm_mmu *mmu)
{
return mmu->cpu_role.base.level > 0;
@@ -2022,7 +2028,7 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
*/
const union kvm_mmu_page_role sync_role_ign = {
.level = 0xf,
- .access = 0x7,
+ .access = ACC_ALL,
.quadrant = 0x3,
.passthrough = 0x1,
};
@@ -3439,12 +3445,13 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int ret;
+ int ret, access;
gfn_t base_gfn = fault->gfn;
kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(fault);
+ access = vcpu->arch.mmu->root_role.access;
+ trace_kvm_mmu_spte_requested(fault, access);
for_each_shadow_entry(vcpu, fault->addr, it) {
/*
* We cannot overwrite existing page tables with an NX
@@ -3457,7 +3464,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (it.level == fault->goal_level)
break;
- sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
+ sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, access);
if (sp == ERR_PTR(-EEXIST))
continue;
@@ -3470,7 +3477,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (WARN_ON_ONCE(it.level != fault->goal_level))
return -EFAULT;
- ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, access,
base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
@@ -4341,7 +4348,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
{
if (exception)
exception->error_code = 0;
- return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
+ /*
+ * EPT MBEC uses the effective access bits from the PTE to distinguish
+ * user and supervisor accesses, and treats every linear address as a
+ * user-mode address if CR0.PG=0. Therefore *include* ACC_USER_MASK in
+ * the last argument to kvm_translate_gpa (which NPT does not use).
+ */
+ return kvm_translate_gpa(vcpu, mmu, vaddr, access | PFERR_GUEST_FINAL_MASK,
+ exception, ACC_ALL);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -5477,7 +5491,7 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
static inline bool boot_cpu_is_amd(void)
{
WARN_ON_ONCE(!tdp_enabled);
- return shadow_x_mask == 0;
+ return shadow_xs_mask == 0;
}
/*
@@ -5522,55 +5536,106 @@ reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
max_huge_page_level);
}
-#define BYTE_MASK(access) \
- ((1 & (access) ? 2 : 0) | \
- (2 & (access) ? 4 : 0) | \
- (3 & (access) ? 8 : 0) | \
- (4 & (access) ? 16 : 0) | \
- (5 & (access) ? 32 : 0) | \
- (6 & (access) ? 64 : 0) | \
- (7 & (access) ? 128 : 0))
-
-
-static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
+/*
+ * Build a mask with all combinations of PTE access rights that
+ * include the given access bit. The mask can be queried with
+ * "mask & (1 << access)", where access is a combination of
+ * ACC_* bits.
+ *
+ * By mixing and matching multiple masks returned by ACC_BITS_MASK,
+ * update_permission_bitmask() builds what is effectively a
+ * two-dimensional array of bools. The second dimension is
+ * provided by individual bits of permissions[pfec >> 1], and
+ * logical &, | and ~ operations operate on all the 16 possible
+ * combinations of ACC_* bits.
+ */
+#define ACC_BITS_MASK(access) \
+ ((1 & (access) ? 1 << 1 : 0) | \
+ (2 & (access) ? 1 << 2 : 0) | \
+ (3 & (access) ? 1 << 3 : 0) | \
+ (4 & (access) ? 1 << 4 : 0) | \
+ (5 & (access) ? 1 << 5 : 0) | \
+ (6 & (access) ? 1 << 6 : 0) | \
+ (7 & (access) ? 1 << 7 : 0) | \
+ (8 & (access) ? 1 << 8 : 0) | \
+ (9 & (access) ? 1 << 9 : 0) | \
+ (10 & (access) ? 1 << 10 : 0) | \
+ (11 & (access) ? 1 << 11 : 0) | \
+ (12 & (access) ? 1 << 12 : 0) | \
+ (13 & (access) ? 1 << 13 : 0) | \
+ (14 & (access) ? 1 << 14 : 0) | \
+ (15 & (access) ? 1 << 15 : 0))
+
+static void update_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
{
- unsigned byte;
+ unsigned index;
- const u8 x = BYTE_MASK(ACC_EXEC_MASK);
- const u8 w = BYTE_MASK(ACC_WRITE_MASK);
- const u8 u = BYTE_MASK(ACC_USER_MASK);
+ const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK);
+ const u16 r = ACC_BITS_MASK(ACC_READ_MASK);
bool cr4_smep = is_cr4_smep(mmu);
bool cr4_smap = is_cr4_smap(mmu);
bool cr0_wp = is_cr0_wp(mmu);
bool efer_nx = is_efer_nx(mmu);
- for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
- unsigned pfec = byte << 1;
+ /*
+ * In hardware, page fault error codes are generated (as the name
+ * suggests) on any kind of page fault. permission_fault() and
+ * paging_tmpl.h already use the same bits after a successful page
+ * table walk, to indicate the kind of access being performed.
+ *
+ * However, PFERR_PRESENT_MASK and PFERR_RSVD_MASK are never set here,
+ * exactly because the page walk is successful. PFERR_PRESENT_MASK is
+ * removed by the shift, while PFERR_RSVD_MASK is repurposed in
+ * permission_fault() to indicate accesses that are *not* subject to
+ * SMAP restrictions.
+ */
+ for (index = 0; index < ARRAY_SIZE(mmu->permissions); ++index) {
+ unsigned pfec = index << 1;
/*
- * Each "*f" variable has a 1 bit for each UWX value
+ * Each "*f" variable has a 1 bit for each ACC_* combo
* that causes a fault with the given PFEC.
*/
+ /* Faults from reads to non-readable pages */
+ u16 rf = (pfec & (PFERR_WRITE_MASK|PFERR_FETCH_MASK)) ? 0 : (u16)~r;
/* Faults from writes to non-writable pages */
- u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
+ u16 wf = (pfec & PFERR_WRITE_MASK) ? (u16)~w : 0;
/* Faults from user mode accesses to supervisor pages */
- u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
- /* Faults from fetches of non-executable pages*/
- u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
- /* Faults from kernel mode fetches of user pages */
- u8 smepf = 0;
+ u16 uf = 0;
+ /* Faults from fetches of non-executable pages */
+ u16 ff = 0;
/* Faults from kernel mode accesses of user pages */
- u8 smapf = 0;
+ u16 smapf = 0;
+
+ if (ept) {
+ const u16 xs = ACC_BITS_MASK(ACC_EXEC_MASK);
+ const u16 xu = ACC_BITS_MASK(ACC_USER_EXEC_MASK);
+
+ if (pfec & PFERR_FETCH_MASK) {
+ /* Ignore XU unless MBEC is enabled. */
+ if (cr4_smep)
+ ff = pfec & PFERR_USER_MASK ? (u16)~xu : (u16)~xs;
+ else
+ ff = (u16)~xs;
+ }
+ } else {
+ const u16 x = ACC_BITS_MASK(ACC_EXEC_MASK);
+ const u16 u = ACC_BITS_MASK(ACC_USER_MASK);
- if (!ept) {
/* Faults from kernel mode accesses to user pages */
- u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
+ u16 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
- /* Not really needed: !nx will cause pte.nx to fault */
- if (!efer_nx)
- ff = 0;
+ /*
+ * For NPT GMET, U=0 does not affect reads and writes. Fetches
+ * are handled below via cr4_smep.
+ */
+ if (!(tdp && cr4_smep))
+ uf = (pfec & PFERR_USER_MASK) ? (u16)~u : 0;
+
+ if (efer_nx)
+ ff |= (pfec & PFERR_FETCH_MASK) ? (u16)~x : 0;
/* Allow supervisor writes if !cr0.wp */
if (!cr0_wp)
@@ -5578,7 +5643,7 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
/* Disallow supervisor fetches of user code if cr4.smep */
if (cr4_smep)
- smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
+ ff |= (pfec & PFERR_FETCH_MASK) ? kf : 0;
/*
* SMAP:kernel-mode data accesses from user-mode
@@ -5591,16 +5656,15 @@ static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
* - The access is supervisor mode
* - If implicit supervisor access or X86_EFLAGS_AC is clear
*
- * Here, we cover the first four conditions.
- * The fifth is computed dynamically in permission_fault();
- * PFERR_RSVD_MASK bit will be set in PFEC if the access is
- * *not* subject to SMAP restrictions.
+ * Here, we cover the first four conditions. The fifth
+ * is computed dynamically in permission_fault() and
+ * communicated by setting PFERR_RSVD_MASK.
*/
if (cr4_smap)
smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
}
- mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
+ mmu->permissions[index] = ff | uf | wf | rf | smapf;
}
}
@@ -5679,7 +5743,7 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
return;
reset_guest_rsvds_bits_mask(vcpu, mmu);
- update_permission_bitmask(mmu, false);
+ update_permission_bitmask(mmu, mmu == &vcpu->arch.guest_mmu, false);
update_pkru_bitmask(mmu);
}
@@ -5714,7 +5778,7 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
role.base.efer_nx = ____is_efer_nx(regs);
role.base.cr0_wp = ____is_cr0_wp(regs);
- role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.cr4_smep = ____is_cr4_smep(regs);
role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
@@ -5726,7 +5790,6 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
else
role.base.level = PT32_ROOT_LEVEL;
- role.ext.cr4_smep = ____is_cr4_smep(regs);
role.ext.cr4_smap = ____is_cr4_smap(regs);
role.ext.cr4_pse = ____is_cr4_pse(regs);
@@ -5734,6 +5797,8 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
role.ext.efer_lma = ____is_efer_lma(regs);
+
+ role.ext.has_pferr_fetch = role.base.efer_nx | role.base.cr4_smep;
return role;
}
@@ -5783,8 +5848,8 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
{
union kvm_mmu_page_role role = {0};
- role.access = ACC_ALL;
role.cr0_wp = true;
+ role.cr4_smep = kvm_x86_call(tdp_has_smep)(vcpu->kvm);
role.efer_nx = true;
role.smm = cpu_role.base.smm;
role.guest_mode = cpu_role.base.guest_mode;
@@ -5793,6 +5858,11 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
role.direct = true;
role.has_4_byte_gpte = false;
+ /* All TDP pages are supervisor-executable */
+ role.access = ACC_ALL;
+ if (role.cr4_smep && shadow_user_mask)
+ role.access &= ~ACC_USER_MASK;
+
return role;
}
@@ -5872,13 +5942,13 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
}
-void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
- unsigned long cr4, u64 efer, gpa_t nested_cr3)
+void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
+ u64 efer, gpa_t nested_cr3, u64 misc_ctl)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
- .cr0 = cr0,
- .cr4 = cr4 & ~X86_CR4_PKE,
+ .cr0 = X86_CR0_PG | X86_CR0_WP,
+ .cr4 = cr4 & ~(X86_CR4_PKE | X86_CR4_SMAP),
.efer = efer,
};
union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
@@ -5886,6 +5956,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
/* NPT requires CR0.PG=1. */
WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
+ cpu_role.base.cr4_smep = (misc_ctl & SVM_MISC_ENABLE_GMET) != 0;
root_role = cpu_role.base;
root_role.level = kvm_mmu_get_tdp_level(vcpu);
@@ -5900,7 +5971,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
static union kvm_cpu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
- bool execonly, u8 level)
+ bool execonly, u8 level, bool mbec)
{
union kvm_cpu_role role = {0};
@@ -5910,6 +5981,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
*/
WARN_ON_ONCE(is_smm(vcpu));
role.base.level = level;
+ role.base.cr4_smep = mbec;
role.base.has_4_byte_gpte = false;
role.base.direct = false;
role.base.ad_disabled = !accessed_dirty;
@@ -5925,13 +5997,13 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
int huge_page_level, bool accessed_dirty,
- gpa_t new_eptp)
+ bool mbec, gpa_t new_eptp)
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
union kvm_cpu_role new_mode =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
- execonly, level);
+ execonly, level, mbec);
if (new_mode.as_u64 != context->cpu_role.as_u64) {
/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
@@ -5942,7 +6014,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_spte = ept_sync_spte;
- update_permission_bitmask(context, true);
+ update_permission_bitmask(context, true, true);
context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
reset_ept_shadow_zero_bits_mask(context, execonly);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 764e3015d021..fa01719baf8d 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -25,7 +25,8 @@
#define KVM_MMU_PAGE_PRINTK() ({ \
const char *saved_ptr = trace_seq_buffer_ptr(p); \
static const char *access_str[] = { \
- "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
+ "----", "r---", "-w--", "rw--", "--u-", "r-u-", "-wu-", "rwu-", \
+ "---x", "r--x", "-w-x", "rw-x", "--ux", "r-ux", "-wux", "rwux" \
}; \
union kvm_mmu_page_role role; \
\
@@ -356,8 +357,8 @@ TRACE_EVENT(
__entry->sptep = virt_to_phys(sptep);
__entry->level = level;
__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
- __entry->x = is_executable_pte(__entry->spte);
- __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
+ __entry->x = (__entry->spte & (shadow_xs_mask | shadow_nx_mask)) == shadow_xs_mask;
+ __entry->u = !!(__entry->spte & (shadow_xu_mask | shadow_user_mask));
),
TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
@@ -365,30 +366,32 @@ TRACE_EVENT(
__entry->r ? "r" : "-",
__entry->spte & PT_WRITABLE_MASK ? "w" : "-",
__entry->x ? "x" : "-",
- __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
+ __entry->u ? "u" : "-",
__entry->level, __entry->sptep
)
);
TRACE_EVENT(
kvm_mmu_spte_requested,
- TP_PROTO(struct kvm_page_fault *fault),
- TP_ARGS(fault),
+ TP_PROTO(struct kvm_page_fault *fault, u8 access),
+ TP_ARGS(fault, access),
TP_STRUCT__entry(
__field(u64, gfn)
__field(u64, pfn)
__field(u8, level)
+ __field(u8, access)
),
TP_fast_assign(
__entry->gfn = fault->gfn;
__entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
__entry->level = fault->goal_level;
+ __entry->access = access;
),
- TP_printk("gfn %llx pfn %llx level %d",
- __entry->gfn, __entry->pfn, __entry->level
+ TP_printk("gfn %llx pfn %llx level %d access %x",
+ __entry->gfn, __entry->pfn, __entry->level, __entry->access
)
);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 901cd2bd40b8..07100bbfc270 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -124,12 +124,17 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce
*access &= mask;
}
-static inline int FNAME(is_present_gpte)(unsigned long pte)
+static inline int FNAME(is_present_gpte)(struct kvm_mmu *mmu,
+ unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
return pte & PT_PRESENT_MASK;
#else
- return pte & 7;
+ /*
+ * For EPT, an entry is present if any of bits 2:0 are set.
+ * With mode-based execute control, bit 10 also indicates presence.
+ */
+ return pte & (7 | (mmu_has_mbec(mmu) ? VMX_EPT_USER_EXECUTABLE_MASK : 0));
#endif
}
@@ -152,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (!FNAME(is_present_gpte)(gpte))
+ if (!FNAME(is_present_gpte)(vcpu->arch.mmu, gpte))
goto no_present;
/* Prefetch only accessed entries (unless A/D bits are disabled). */
@@ -170,25 +175,31 @@ no_present:
return true;
}
-/*
- * For PTTYPE_EPT, a page table can be executable but not readable
- * on supported processors. Therefore, set_spte does not automatically
- * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
- * to signify readability since it isn't used in the EPT case
- */
static inline unsigned FNAME(gpte_access)(u64 gpte)
{
unsigned access;
+ /*
+ * Set bits in ACC_*_MASK even if they might not be used in the
+ * actual checks. For example, if EFER.NX is clear, permission_fault()
+ * will ignore ACC_EXEC_MASK, and if MBEC is disabled it will
+ * ignore ACC_USER_EXEC_MASK.
+ */
#if PTTYPE == PTTYPE_EPT
access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
- ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_READ_MASK : 0) |
+ ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0);
#else
- BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
- BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+ /*
+ * P is set here, so the page is always readable and W/U/!NX represent
+ * allowed accesses.
+ */
+ BUILD_BUG_ON(ACC_READ_MASK != PT_PRESENT_MASK);
+ BUILD_BUG_ON(ACC_WRITE_MASK != PT_WRITABLE_MASK);
+ BUILD_BUG_ON(ACC_USER_MASK != PT_USER_MASK);
+ BUILD_BUG_ON(ACC_EXEC_MASK & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK));
access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
- /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
- access ^= (gpte >> PT64_NX_SHIFT);
+ access |= gpte & PT64_NX_MASK ? 0 : ACC_EXEC_MASK;
#endif
return access;
@@ -332,7 +343,7 @@ retry_walk:
if (walker->level == PT32E_ROOT_LEVEL) {
pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!FNAME(is_present_gpte)(pte))
+ if (!FNAME(is_present_gpte)(mmu, pte))
goto error;
--walker->level;
}
@@ -377,7 +388,8 @@ retry_walk:
walker->pte_gpa[walker->level - 1] = pte_gpa;
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn),
- nested_access, &walker->fault);
+ nested_access | PFERR_GUEST_PAGE_MASK,
+ &walker->fault, 0);
/*
* FIXME: This can happen if emulation (for of an INS/OUTS
@@ -414,7 +426,7 @@ retry_walk:
*/
pte_access = pt_access & (pte ^ walk_nx_mask);
- if (unlikely(!FNAME(is_present_gpte)(pte)))
+ if (unlikely(!FNAME(is_present_gpte)(mmu, pte)))
goto error;
if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
@@ -445,7 +457,9 @@ retry_walk:
gfn += pse36_gfn_delta(pte);
#endif
- real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault);
+ real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn),
+ access | PFERR_GUEST_FINAL_MASK,
+ &walker->fault, walker->pte_access);
if (real_gpa == INVALID_GPA)
return 0;
@@ -475,7 +489,7 @@ retry_walk:
error:
errcode |= write_fault | user_fault;
- if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu)))
+ if (fetch_fault && has_pferr_fetch(mmu))
errcode |= PFERR_FETCH_MASK;
walker->fault.vector = PF_VECTOR;
@@ -492,7 +506,7 @@ error:
* [2:0] - Derive from the access bits. The exit_qualification might be
* out of date if it is serving an EPT misconfiguration.
* [5:3] - Calculated by the page walk of the guest EPT page tables
- * [7:8] - Derived from [7:8] of real exit_qualification
+ * [7:11] - Derived from [7:11] of real exit_qualification
*
* The other bits are set to 0.
*/
@@ -501,16 +515,27 @@ error:
if (write_fault)
walker->fault.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
- if (user_fault)
- walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
- if (fetch_fault)
+ else if (fetch_fault)
walker->fault.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
+ else
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
+
+ /*
+ * Accesses to guest paging structures are either "reads" or
+ * "read+write" accesses, so consider them the latter if write_fault
+ * is true.
+ */
+ if (access & PFERR_GUEST_PAGE_MASK)
+ walker->fault.exit_qualification |= EPT_VIOLATION_ACC_READ;
/*
* Note, pte_access holds the raw RWX bits from the EPTE, not
* ACC_*_MASK flags!
*/
walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+ if (mmu_has_mbec(mmu))
+ walker->fault.exit_qualification |=
+ EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
}
#endif
walker->fault.address = addr;
@@ -709,7 +734,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*/
kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(fault);
+ trace_kvm_mmu_spte_requested(fault, gw->pte_access);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
/*
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 85a0473809b0..d2f5f7dd8fe1 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -29,8 +29,9 @@ bool __read_mostly kvm_ad_enabled;
u64 __read_mostly shadow_host_writable_mask;
u64 __read_mostly shadow_mmu_writable_mask;
u64 __read_mostly shadow_nx_mask;
-u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
u64 __read_mostly shadow_user_mask;
+u64 __read_mostly shadow_xs_mask; /* mutually exclusive with nx_mask and user_mask */
+u64 __read_mostly shadow_xu_mask; /* mutually exclusive with nx_mask and user_mask */
u64 __read_mostly shadow_accessed_mask;
u64 __read_mostly shadow_dirty_mask;
u64 __read_mostly shadow_mmio_value;
@@ -194,12 +195,6 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
int is_host_mmio = -1;
bool wrprot = false;
- /*
- * For the EPT case, shadow_present_mask has no RWX bits set if
- * exec-only page table entries are supported. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
@@ -223,18 +218,26 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* would tie make_spte() further to vCPU/MMU state, and add complexity
* just to optimize a mode that is anything but performance critical.
*/
- if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
- is_nx_huge_page_enabled(vcpu->kvm)) {
+ if (level > PG_LEVEL_4K && is_nx_huge_page_enabled(vcpu->kvm)) {
pte_access &= ~ACC_EXEC_MASK;
+ if (shadow_xu_mask)
+ pte_access &= ~ACC_USER_EXEC_MASK;
}
- if (pte_access & ACC_EXEC_MASK)
- spte |= shadow_x_mask;
- else
- spte |= shadow_nx_mask;
-
- if (pte_access & ACC_USER_MASK)
- spte |= shadow_user_mask;
+ if (pte_access & ACC_READ_MASK)
+ spte |= PT_PRESENT_MASK; /* or VMX_EPT_READABLE_MASK */
+
+ if (shadow_nx_mask) {
+ if (!(pte_access & ACC_EXEC_MASK))
+ spte |= shadow_nx_mask;
+ if (pte_access & ACC_USER_MASK)
+ spte |= shadow_user_mask;
+ } else {
+ if (pte_access & ACC_EXEC_MASK)
+ spte |= shadow_xs_mask;
+ if (pte_access & ACC_USER_EXEC_MASK)
+ spte |= shadow_xu_mask;
+ }
if (level > PG_LEVEL_4K)
spte |= PT_PAGE_SIZE_MASK;
@@ -317,14 +320,18 @@ static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
return spte;
}
-static u64 make_spte_executable(u64 spte)
+static u64 change_spte_executable(u64 spte, u8 access)
{
- return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
-}
+ u64 set, clear;
-static u64 make_spte_nonexecutable(u64 spte)
-{
- return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
+ if (shadow_nx_mask)
+ set = (access & ACC_EXEC_MASK) ? 0 : shadow_nx_mask;
+ else
+ set =
+ (access & ACC_EXEC_MASK ? shadow_xs_mask : 0) |
+ (access & ACC_USER_EXEC_MASK ? shadow_xu_mask : 0);
+ clear = set ^ (shadow_nx_mask | shadow_xs_mask | shadow_xu_mask);
+ return modify_spte_protections(spte, set, clear);
}
/*
@@ -356,8 +363,8 @@ u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
* the page executable as the NX hugepage mitigation no longer
* applies.
*/
- if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
- child_spte = make_spte_executable(child_spte);
+ if (is_nx_huge_page_enabled(kvm))
+ child_spte = change_spte_executable(child_spte, role.access);
}
return child_spte;
@@ -379,7 +386,7 @@ u64 make_huge_spte(struct kvm *kvm, u64 small_spte, int level)
huge_spte &= KVM_HPAGE_MASK(level) | ~PAGE_MASK;
if (is_nx_huge_page_enabled(kvm))
- huge_spte = make_spte_nonexecutable(huge_spte);
+ huge_spte = change_spte_executable(huge_spte, 0);
return huge_spte;
}
@@ -389,7 +396,8 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
u64 spte = SPTE_MMU_PRESENT_MASK;
spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask | shadow_me_value;
+ PT_PRESENT_MASK /* or VMX_EPT_READABLE_MASK */ |
+ shadow_user_mask | shadow_xs_mask | shadow_xu_mask | shadow_me_value;
if (ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
@@ -489,20 +497,37 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+void kvm_mmu_set_ept_masks(bool has_ad_bits)
{
kvm_ad_enabled = has_ad_bits;
- shadow_user_mask = VMX_EPT_READABLE_MASK;
+ shadow_user_mask = 0;
shadow_accessed_mask = VMX_EPT_ACCESS_BIT;
shadow_dirty_mask = VMX_EPT_DIRTY_BIT;
shadow_nx_mask = 0ull;
- shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
- shadow_present_mask =
- (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
+ shadow_xs_mask = VMX_EPT_EXECUTABLE_MASK;
+
+ /*
+ * The MMU always maps ACC_EXEC_MASK and ACC_USER_EXEC_MASK to the
+ * XS and XU bits of shadow EPT entries, regardless of whether MBEC
+ * is available on the host or enabled in the VMCS.
+ *
+ * For the non-nested case, pages are mapped with ACC_EXEC_MASK
+ * and ACC_USER_EXEC_MASK set in tandem, so XS == XU and the
+ * host's MBEC setting does not matter. On hardware without MBEC
+ * the XU bit is reserved-as-ignored, and setting it does no harm.
+ *
+ * For nested EPT, when MBEC is disabled by L1, correctness relies
+ * on (a) ignoring bit 10 of the gPTE in is_present_gpte(), rather
+ * than treating it as a present bit, and (b) permission_fault()
+ * using an mmu->permissions[] array that effectively ignores
+ * ACC_USER_EXEC_MASK. Bit 10 of the gPTE does end up mirrored
+ * in the sPTEs but is ignored because L2 runs with MBEC disabled.
+ */
+ shadow_xu_mask = VMX_EPT_USER_EXECUTABLE_MASK;
+ shadow_present_mask = VMX_EPT_SUPPRESS_VE_BIT;
- shadow_acc_track_mask = VMX_EPT_RWX_MASK;
+ shadow_acc_track_mask = VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK;
shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE;
@@ -550,7 +575,8 @@ void kvm_mmu_reset_all_pte_masks(void)
shadow_accessed_mask = PT_ACCESSED_MASK;
shadow_dirty_mask = PT_DIRTY_MASK;
shadow_nx_mask = PT64_NX_MASK;
- shadow_x_mask = 0;
+ shadow_xs_mask = 0;
+ shadow_xu_mask = 0;
shadow_present_mask = PT_PRESENT_MASK;
shadow_acc_track_mask = 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 8c0ffa2cded6..13eea94dd212 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -18,9 +18,19 @@
#define SPTE_MMU_PRESENT_MASK BIT_ULL(11)
/*
+ * The ignored high bits are allocated as follows:
+ * - bits 52, 54: saved R and X bits for access tracking when EPT does not have A/D
+ * - bit 53 (EPT only): host writable
+ * - bit 55 (EPT only): MMU-writable
+ * - bits 56-59: unused
+ * - bits 60-61: type of A/D tracking
+ * - bit 62 (EPT only): saved XU bit for access tracking when EPT does not have A/D
+ */
+
+/*
* TDP SPTES (more specifically, EPT SPTEs) may not have A/D bits, and may also
* be restricted to using write-protection (for L2 when CPU dirty logging, i.e.
- * PML, is enabled). Use bits 52 and 53 to hold the type of A/D tracking that
+ * PML, is enabled). Use bits 60 and 61 to hold the type of A/D tracking that
* is must be employed for a given TDP SPTE.
*
* Note, the "enabled" mask must be '0', as bits 62:52 are _reserved_ for PAE
@@ -29,7 +39,7 @@
* TDP with CPU dirty logging (PML). If NPT ever gains PML-like support, it
* must be restricted to 64-bit KVM.
*/
-#define SPTE_TDP_AD_SHIFT 52
+#define SPTE_TDP_AD_SHIFT 60
#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
@@ -42,18 +52,6 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
#define SPTE_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
#endif
-#define SPTE_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
- | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
-
-#define ACC_EXEC_MASK 1
-#define ACC_WRITE_MASK PT_WRITABLE_MASK
-#define ACC_USER_MASK PT_USER_MASK
-#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-
-/* The mask for the R/X bits in EPT PTEs */
-#define SPTE_EPT_READABLE_MASK 0x1ull
-#define SPTE_EPT_EXECUTABLE_MASK 0x4ull
-
#define SPTE_LEVEL_BITS 9
#define SPTE_LEVEL_SHIFT(level) __PT_LEVEL_SHIFT(level, SPTE_LEVEL_BITS)
#define SPTE_INDEX(address, level) __PT_INDEX(address, level, SPTE_LEVEL_BITS)
@@ -66,9 +64,10 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
* restored only when a write is attempted to the page. This mask obviously
* must not overlap the A/D type mask.
*/
-#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (SPTE_EPT_READABLE_MASK | \
- SPTE_EPT_EXECUTABLE_MASK)
-#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 54
+#define SHADOW_ACC_TRACK_SAVED_BITS_MASK (VMX_EPT_READABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK | \
+ VMX_EPT_USER_EXECUTABLE_MASK)
+#define SHADOW_ACC_TRACK_SAVED_BITS_SHIFT 52
#define SHADOW_ACC_TRACK_SAVED_MASK (SHADOW_ACC_TRACK_SAVED_BITS_MASK << \
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
@@ -87,8 +86,8 @@ static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
* to not overlap the A/D type mask or the saved access bits of access-tracked
* SPTEs when A/D bits are disabled.
*/
-#define EPT_SPTE_HOST_WRITABLE BIT_ULL(57)
-#define EPT_SPTE_MMU_WRITABLE BIT_ULL(58)
+#define EPT_SPTE_HOST_WRITABLE BIT_ULL(53)
+#define EPT_SPTE_MMU_WRITABLE BIT_ULL(55)
static_assert(!(EPT_SPTE_HOST_WRITABLE & SPTE_TDP_AD_MASK));
static_assert(!(EPT_SPTE_MMU_WRITABLE & SPTE_TDP_AD_MASK));
@@ -99,11 +98,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
#undef SHADOW_ACC_TRACK_SAVED_MASK
/*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is an 18 bit subset of
* the memslots generation and is derived as follows:
*
- * Bits 0-7 of the MMIO generation are propagated to spte bits 3-10
- * Bits 8-18 of the MMIO generation are propagated to spte bits 52-62
+ * Bits 0-6 of the MMIO generation are propagated to spte bits 3-9
+ * Bits 7-17 of the MMIO generation are propagated to spte bits 52-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -114,7 +113,7 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK));
*/
#define MMIO_SPTE_GEN_LOW_START 3
-#define MMIO_SPTE_GEN_LOW_END 10
+#define MMIO_SPTE_GEN_LOW_END 9
#define MMIO_SPTE_GEN_HIGH_START 52
#define MMIO_SPTE_GEN_HIGH_END 62
@@ -136,7 +135,8 @@ static_assert(!(SPTE_MMU_PRESENT_MASK &
* and so they're off-limits for generation; additional checks ensure the mask
* doesn't overlap legal PA bits), and bit 63 (carved out for future usage).
*/
-#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | GENMASK_ULL(2, 0))
+#define SPTE_MMIO_ALLOWED_MASK (BIT_ULL(63) | GENMASK_ULL(51, 12) | \
+ BIT_ULL(10) | GENMASK_ULL(2, 0))
static_assert(!(SPTE_MMIO_ALLOWED_MASK &
(SPTE_MMU_PRESENT_MASK | MMIO_SPTE_GEN_LOW_MASK | MMIO_SPTE_GEN_HIGH_MASK)));
@@ -144,7 +144,7 @@ static_assert(!(SPTE_MMIO_ALLOWED_MASK &
#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
/* remember to adjust the comment above as well if you change these */
-static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 7 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
@@ -179,8 +179,9 @@ extern bool __read_mostly kvm_ad_enabled;
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
-extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_xs_mask; /* mutually exclusive with nx_mask and user_mask */
+extern u64 __read_mostly shadow_xu_mask; /* mutually exclusive with nx_mask and user_mask */
extern u64 __read_mostly shadow_accessed_mask;
extern u64 __read_mostly shadow_dirty_mask;
extern u64 __read_mostly shadow_mmio_value;
@@ -220,10 +221,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Only used by the TDP MMU.
*/
-#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
+#define FROZEN_SPTE (SHADOW_NONPRESENT_VALUE | 0x1a0ULL)
-/* Frozen SPTEs must not be misconstrued as shadow present PTEs. */
-static_assert(!(FROZEN_SPTE & SPTE_MMU_PRESENT_MASK));
+/* Frozen SPTEs must not be misconstrued as shadow or MMU present PTEs. */
+static_assert(!(FROZEN_SPTE & (SPTE_MMU_PRESENT_MASK |
+ VMX_EPT_RWX_MASK | VMX_EPT_USER_EXECUTABLE_MASK)));
static inline bool is_frozen_spte(u64 spte)
{
@@ -357,7 +359,13 @@ static inline bool is_last_spte(u64 pte, int level)
static inline bool is_executable_pte(u64 spte)
{
- return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+ /*
+ * For now, return true if either the XS or XU bit is set.
+ * This function is only used for fast_page_fault,
+ * which never processes shadow EPT, and regular page
+ * tables always have XS==XU.
+ */
+ return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
}
static inline kvm_pfn_t spte_to_pfn(u64 pte)
@@ -387,6 +395,8 @@ static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
u64 pte)
{
+ if (pte & VMX_EPT_USER_EXECUTABLE_MASK)
+ pte |= VMX_EPT_EXECUTABLE_MASK;
return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7b1102d26f9c..5a2f8ce9a32b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1185,9 +1185,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
if (unlikely(!fault->slot))
- new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
+ new_spte = make_mmio_spte(vcpu, iter->gfn, sp->role.access);
else
- wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ wrprot = make_spte(vcpu, sp, fault->slot, sp->role.access, iter->gfn,
fault->pfn, iter->old_spte, fault->prefetch,
false, fault->map_writable, &new_spte);
@@ -1272,7 +1272,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(fault);
+ trace_kvm_mmu_spte_requested(fault, root->role.access);
rcu_read_lock();
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 961804df5f45..3d1fd1776e19 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -93,9 +93,10 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
* when called via KVM_SET_NESTED_STATE, that state may _not_ match current
* vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
*/
- kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
+ kvm_init_shadow_npt_mmu(vcpu, svm->vmcb01.ptr->save.cr4,
svm->vmcb01.ptr->save.efer,
- svm->nested.ctl.nested_cr3);
+ svm->nested.ctl.nested_cr3,
+ svm->nested.ctl.misc_ctl);
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
@@ -488,11 +489,14 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
nested_svm_sanitize_intercept(vcpu, to, SKINIT);
nested_svm_sanitize_intercept(vcpu, to, RDPRU);
- /* Always clear SVM_MISC_ENABLE_NP if the guest cannot use NPTs */
+ /* Always clear misc_ctl bits that the guest cannot use */
to->misc_ctl = from->misc_ctl;
if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT))
to->misc_ctl &= ~SVM_MISC_ENABLE_NP;
+ if (!gmet_enabled || !guest_cpu_cap_has(vcpu, X86_FEATURE_GMET))
+ to->misc_ctl &= ~SVM_MISC_ENABLE_GMET;
+
to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK;
to->msrpm_base_pa = from->msrpm_base_pa & PAGE_MASK;
to->tsc_offset = from->tsc_offset;
@@ -857,7 +861,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
* the latter, L1 runs L2 with shadow page tables that translate L2 GVAs
* to L1 GPAs, so the same NPTs can be used for L1 and L2.
*/
- vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & SVM_MISC_ENABLE_NP;
+ vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & (SVM_MISC_ENABLE_NP | SVM_MISC_ENABLE_GMET);
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
@@ -894,9 +898,13 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
/* Also overwritten later if necessary. */
vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
- /* nested_cr3. */
- if (nested_npt_enabled(svm))
+ /* Use vmcb01 MMU and format unless the guest uses nNPT, in which case GMET comes from svm->nested.ctl */
+ if (nested_npt_enabled(svm)) {
+ vmcb02->control.misc_ctl &= ~SVM_MISC_ENABLE_GMET;
+ vmcb02->control.misc_ctl |= (svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET);
+
nested_svm_init_mmu_context(vcpu);
+ }
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset,
vmcb12_ctrl->tsc_offset,
@@ -2071,8 +2079,26 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
return true;
}
+static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 access,
+ struct x86_exception *exception,
+ u64 pte_access)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /* Non-GMET walks are always user-walks */
+ if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET))
+ access |= PFERR_USER_MASK;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+}
+
struct kvm_x86_nested_ops svm_nested_ops = {
.leave_nested = svm_leave_nested,
+ .translate_nested_gpa = svm_translate_nested_gpa,
.is_exception_vmexit = nested_svm_is_exception_vmexit,
.check_events = svm_check_nested_events,
.triple_fault = nested_svm_triple_fault,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e7fdd7a9c280..a82471a6d3ea 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -138,6 +138,9 @@ module_param(pause_filter_count_max, ushort, 0444);
bool __ro_after_init npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);
+bool gmet_enabled = true;
+module_param_named(gmet, gmet_enabled, bool, 0444);
+
/* allow nested virtualization in KVM/SVM */
static int __ro_after_init nested = true;
module_param(nested, int, 0444);
@@ -1209,6 +1212,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event)
save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
}
+
+ if (gmet_enabled)
+ control->misc_ctl |= SVM_MISC_ENABLE_GMET;
+
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
@@ -1986,6 +1993,18 @@ static int npf_interception(struct kvm_vcpu *vcpu)
}
}
+ if (!is_sev_es_guest(vcpu) &&
+ (svm->vmcb->control.misc_ctl & SVM_MISC_ENABLE_GMET) &&
+ (error_code & PFERR_FETCH_MASK)) {
+ /*
+ * Work around erratum 1218: EXITINFO1[2] May Be Incorrectly Set
+ * When GMET (Guest Mode Execute Trap extension) is Enabled
+ */
+ error_code |= PFERR_USER_MASK;
+ if (svm_get_cpl(vcpu) != 3)
+ error_code &= ~PFERR_USER_MASK;
+ }
+
if (is_sev_snp_guest(vcpu) && (error_code & PFERR_GUEST_ENC_MASK))
error_code |= PFERR_PRIVATE_ACCESS;
@@ -4612,6 +4631,11 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xd9;
}
+static bool svm_tdp_has_smep(struct kvm *kvm)
+{
+ return gmet_enabled;
+}
+
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -5355,6 +5379,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
.write_tsc_multiplier = svm_write_tsc_multiplier,
.load_mmu_pgd = svm_load_mmu_pgd,
+ .tdp_has_smep = svm_tdp_has_smep,
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
@@ -5479,6 +5504,9 @@ static __init void svm_set_cpu_caps(void)
if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+ if (gmet_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_GMET);
+
if (vgif)
kvm_cpu_cap_set(X86_FEATURE_VGIF);
@@ -5588,6 +5616,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
+ if (!npt_enabled || !boot_cpu_has(X86_FEATURE_GMET))
+ gmet_enabled = false;
+
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index a10668d17a16..dd93b3daefa9 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -44,6 +44,7 @@ static inline struct page *__sme_pa_to_page(unsigned long pa)
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
+extern bool gmet_enabled;
extern bool npt_enabled;
extern int nrips;
extern int vgif;
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 56cacc06225e..07469d1cfe74 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -15,6 +15,7 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_mbec;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -300,11 +301,6 @@ static inline bool cpu_has_vmx_flexpriority(void)
cpu_has_vmx_virtualize_apic_accesses();
}
-static inline bool cpu_has_vmx_ept_execute_only(void)
-{
- return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
-}
-
static inline bool cpu_has_vmx_ept_4levels(void)
{
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
@@ -411,4 +407,10 @@ static inline bool cpu_has_notify_vmexit(void)
SECONDARY_EXEC_NOTIFY_VM_EXITING;
}
+static inline bool cpu_has_ept_mbec(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index 412d0829d7a2..08005676702c 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -85,22 +85,30 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
{
u64 error_code;
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
/* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_WRITE)
? PFERR_WRITE_MASK : 0;
/* Is it a fetch fault? */
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification &
+ (EPT_VIOLATION_PROT_MASK & ~EPT_VIOLATION_PROT_USER_EXEC))
? PFERR_PRESENT_MASK : 0;
- if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+ if (mmu_has_mbec(vcpu->arch.mmu))
+ error_code |= (exit_qualification & EPT_VIOLATION_PROT_USER_EXEC)
+ ? PFERR_PRESENT_MASK : 0;
+
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID) {
+ if (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) {
+ error_code |= PFERR_GUEST_FINAL_MASK;
+ if (exit_qualification & EPT_VIOLATION_GVA_USER)
+ error_code |= PFERR_USER_MASK;
+ } else {
+ error_code |= PFERR_GUEST_PAGE_MASK;
+ }
+ }
if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
error_code |= PFERR_PRIVATE_ACCESS;
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
index fc7c4e7bd1bf..bc08fe40590e 100644
--- a/arch/x86/kvm/vmx/hyperv_evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -87,6 +87,7 @@
SECONDARY_EXEC_PT_CONCEAL_VMX | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC | \
SECONDARY_EXEC_ENCLS_EXITING)
#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index dbebddf648be..83d9921277ea 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -755,6 +755,14 @@ static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
return vmx_set_identity_map_addr(kvm, ident_addr);
}
+static bool vt_tdp_has_smep(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return false;
+
+ return vmx_tdp_has_smep(kvm);
+}
+
static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
/* TDX doesn't support L2 guest at the moment. */
@@ -966,6 +974,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.set_tss_addr = vt_op(set_tss_addr),
.set_identity_map_addr = vt_op(set_identity_map_addr),
.get_mt_mask = vmx_get_mt_mask,
+ .tdp_has_smep = vt_op(tdp_has_smep),
.get_exit_info = vt_op(get_exit_info),
.get_entry_info = vt_op(get_entry_info),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3fe88f29be7a..bc1046f32ebc 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -443,10 +443,14 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
exit_qualification = 0;
} else {
+ u64 mask = EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED;
+ if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT)
+ mask |= EPT_VIOLATION_GVA_USER |
+ EPT_VIOLATION_GVA_WRITABLE |
+ EPT_VIOLATION_GVA_NX;
exit_qualification = fault->exit_qualification;
- exit_qualification |= vmx_get_exit_qual(vcpu) &
- (EPT_VIOLATION_GVA_IS_VALID |
- EPT_VIOLATION_GVA_TRANSLATED);
+ exit_qualification |= vmx_get_exit_qual(vcpu) & mask;
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
}
@@ -465,6 +469,13 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
vmcs12->guest_physical_address = fault->address;
}
+static inline bool nested_ept_mbec_enabled(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC);
+}
+
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -473,6 +484,7 @@ static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
nested_ept_ad_enabled(vcpu),
+ nested_ept_mbec_enabled(vcpu),
nested_ept_get_eptp(vcpu));
}
@@ -2440,6 +2452,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC |
SECONDARY_EXEC_DESC);
if (nested_cpu_has(vmcs12,
@@ -7239,7 +7252,8 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
VMX_EPT_PAGE_WALK_5_BIT |
VMX_EPTP_WB_BIT |
VMX_EPT_INVEPT_BIT |
- VMX_EPT_EXECUTE_ONLY_BIT;
+ VMX_EPT_EXECUTE_ONLY_BIT |
+ VMX_EPT_ADVANCED_VMEXIT_INFO_BIT;
msrs->ept_caps &= ept_caps;
msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
@@ -7251,6 +7265,9 @@ static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
msrs->ept_caps |= VMX_EPT_AD_BIT;
}
+ if (enable_mbec)
+ msrs->secondary_ctls_high |=
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
/*
* Advertise EPTP switching irrespective of hardware support,
* KVM emulates it in software so long as VMFUNC is supported.
@@ -7438,8 +7455,29 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
return 0;
}
+static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u64 access,
+ struct x86_exception *exception,
+ u64 pte_access)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+
+ BUG_ON(!mmu_is_nested(vcpu));
+
+ /*
+ * MBEC differentiates based on the effective U/S bit of
+ * the guest page tables, not the processor CPL.
+ */
+ access &= ~PFERR_USER_MASK;
+ if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK))
+ access |= PFERR_USER_MASK;
+
+ return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
+}
+
struct kvm_x86_nested_ops vmx_nested_ops = {
.leave_nested = vmx_leave_nested,
+ .translate_nested_gpa = vmx_translate_nested_gpa,
.is_exception_vmexit = nested_vmx_is_exception_vmexit,
.check_events = vmx_check_nested_events,
.has_events = vmx_has_nested_events,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 04ce321ebdf3..30cd5b19ecf9 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1845,7 +1845,7 @@ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcp
if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
return false;
- return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+ return !(eq & EPT_VIOLATION_PROT_MASK);
}
static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 5c2c33a5f7dc..a0a7a2f267b3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -114,6 +114,9 @@ module_param(emulate_invalid_guest_state, bool, 0444);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, 0444);
+bool __read_mostly enable_mbec = 1;
+module_param_named(mbec, enable_mbec, bool, 0444);
+
module_param(enable_apicv, bool, 0444);
module_param(enable_ipiv, bool, 0444);
@@ -2773,6 +2776,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
@@ -2786,6 +2790,16 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmx_cap->vpid = 0;
}
+ /*
+ * Virtualizing MBEC requires advanced vmexit information in order to
+ * distinguish supervisor and user accesses. For simplicity and clarity,
+ * disable MBEC entirely if advanced vmexit information is not available;
+ * this way mbec=1 in the kvm_intel module parameters implies availability
+ * to nested guests as well.
+ */
+ if (!(vmx_cap->ept & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+
if (!cpu_has_sgx())
_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
@@ -4735,6 +4749,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
*/
exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+ if (!enable_mbec)
+ exec_control &= ~SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -8646,6 +8663,8 @@ __init int vmx_hardware_setup(void)
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
+ if (!cpu_has_ept_mbec() || !enable_ept)
+ enable_mbec = 0;
if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
@@ -8707,8 +8726,7 @@ __init int vmx_hardware_setup(void)
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
if (enable_ept)
- kvm_mmu_set_ept_masks(enable_ept_ad_bits,
- cpu_has_vmx_ept_execute_only());
+ kvm_mmu_set_ept_masks(enable_ept_ad_bits);
else
vt_x86_ops.get_mt_mask = NULL;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index db84e8001da5..0a4e263c4095 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -567,6 +567,7 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_MODE_BASED_EPT_EXEC | \
SECONDARY_EXEC_ENCLS_EXITING | \
SECONDARY_EXEC_EPT_VIOLATION_VE)
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index d09abeac2b56..409858074246 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
+#include "capabilities.h"
#include "x86.h"
__init int vmx_hardware_setup(void);
@@ -104,6 +105,11 @@ int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+static inline bool vmx_tdp_has_smep(struct kvm *kvm)
+{
+ return enable_mbec;
+}
+
void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0a1b63c63d1a..7c6942afae81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1072,7 +1072,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
* to an L1 GPA.
*/
real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
- PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ PFERR_USER_MASK | PFERR_WRITE_MASK |
+ PFERR_GUEST_PAGE_MASK, NULL, 0);
if (real_gpa == INVALID_GPA)
return 0;
@@ -7847,21 +7848,6 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_call(get_segment)(vcpu, var, seg);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
- struct x86_exception *exception)
-{
- struct kvm_mmu *mmu = vcpu->arch.mmu;
- gpa_t t_gpa;
-
- BUG_ON(!mmu_is_nested(vcpu));
-
- /* NPT walks are always user-walks */
- access |= PFERR_USER_MASK;
- t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
-
- return t_gpa;
-}
-
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{