Commit d7f4aac

Merge tag 'kvm-x86-mmu-6.17' of https://github.com/kvm-x86/linux into HEAD
KVM x86 MMU changes for 6.17:

- Exempt nested EPT from the !USER + CR0.WP logic, as EPT doesn't interact
  with CR0.WP.

- Move the TDX hardware setup code to tdx.c to better co-locate TDX code
  and eliminate a few global symbols.

- Dynamically allocate the shadow MMU's hashed page list, and defer
  allocating the hashed list until it's actually needed (the TDP MMU
  doesn't use the list).
2 parents: 1a14928 + 9c4fe6d

11 files changed: 145 additions & 69 deletions

arch/x86/include/asm/kvm_host.h

Lines changed: 3 additions & 3 deletions
@@ -1358,7 +1358,7 @@ struct kvm_arch {
 	bool has_private_mem;
 	bool has_protected_state;
 	bool pre_fault_allowed;
-	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
+	struct hlist_head *mmu_page_hash;
 	struct list_head active_mmu_pages;
 	/*
 	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
@@ -1985,7 +1985,7 @@ void kvm_x86_vendor_exit(void);
 #define __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
 {
-	return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	return kvzalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT);
 }
 
 #define __KVM_HAVE_ARCH_VM_FREE
@@ -2030,7 +2030,7 @@ void kvm_mmu_vendor_module_exit(void);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
-void kvm_mmu_init_vm(struct kvm *kvm);
+int kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 
 void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,

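Side note on the kvm_arch_alloc_vm() hunk: kvzalloc() first attempts a physically contiguous kmalloc() and falls back to vmalloc() for sizes that can't be satisfied, and the matching kvfree() releases memory from either path. With the KVM_NUM_MMU_PAGES array moved out of struct kvm_arch, the VM structure shrinks enough that the kmalloc() path should usually succeed. A rough userspace analogue of the allocator's contract (illustrative only, not the kernel API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Userspace analogue of the kvzalloc()/kvfree() contract relied on in
 * kvm_arch_alloc_vm(): the allocation is zeroed, and the caller frees
 * it without knowing which underlying allocator satisfied it.  Both
 * paths are plain malloc() here, purely for illustration.
 */
static void *kvzalloc_like(size_t size)
{
	void *p = malloc(size);		/* stand-in for the kmalloc() attempt */

	/* A real kvzalloc() would retry with vmalloc() if kmalloc() failed. */
	if (!p)
		return NULL;

	memset(p, 0, size);		/* the 'z': callers get zeroed memory */
	return p;
}

int main(void)
{
	void *vm = kvzalloc_like(4096);	/* think kvm_x86_ops.vm_size */

	printf("vm struct at %p\n", vm);
	free(vm);			/* kernel code uses kvfree() */
	return 0;
}
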
arch/x86/kvm/mmu/mmu.c

Lines changed: 69 additions & 6 deletions
@@ -1983,14 +1983,35 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp)
 	return true;
 }
 
+static __ro_after_init HLIST_HEAD(empty_page_hash);
+
+static struct hlist_head *kvm_get_mmu_page_hash(struct kvm *kvm, gfn_t gfn)
+{
+	/*
+	 * Ensure the load of the hash table pointer itself is ordered before
+	 * loads to walk the table.  The pointer is set at runtime outside of
+	 * mmu_lock when the TDP MMU is enabled, i.e. when the hash table of
+	 * shadow pages becomes necessary only when KVM needs to shadow L1's
+	 * TDP for an L2 guest.  Pairs with the smp_store_release() in
+	 * kvm_mmu_alloc_page_hash().
+	 */
+	struct hlist_head *page_hash = smp_load_acquire(&kvm->arch.mmu_page_hash);
+
+	lockdep_assert_held(&kvm->mmu_lock);
+
+	if (!page_hash)
+		return &empty_page_hash;
+
+	return &page_hash[kvm_page_table_hashfn(gfn)];
+}
+
 #define for_each_valid_sp(_kvm, _sp, _list)				\
 	hlist_for_each_entry(_sp, _list, hash_link)			\
 		if (is_obsolete_sp((_kvm), (_sp))) {			\
 		} else
 
 #define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
-	for_each_valid_sp(_kvm, _sp,					\
-	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
+	for_each_valid_sp(_kvm, _sp, kvm_get_mmu_page_hash(_kvm, _gfn))	\
 		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
 
 static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
@@ -2358,6 +2379,12 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
 	struct kvm_mmu_page *sp;
 	bool created = false;
 
+	/*
+	 * No need for memory barriers, unlike in kvm_get_mmu_page_hash(), as
+	 * mmu_page_hash must be set prior to creating the first shadow root,
+	 * i.e. reaching this point is fully serialized by slots_arch_lock.
+	 */
+	BUG_ON(!kvm->arch.mmu_page_hash);
 	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
 
 	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
@@ -3882,6 +3909,28 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+static int kvm_mmu_alloc_page_hash(struct kvm *kvm)
+{
+	struct hlist_head *h;
+
+	if (kvm->arch.mmu_page_hash)
+		return 0;
+
+	h = kvcalloc(KVM_NUM_MMU_PAGES, sizeof(*h), GFP_KERNEL_ACCOUNT);
+	if (!h)
+		return -ENOMEM;
+
+	/*
+	 * Ensure the hash table pointer is set only after all stores to zero
+	 * the memory are retired.  Pairs with the smp_load_acquire() in
+	 * kvm_get_mmu_page_hash().  Note, mmu_lock must be held for write to
+	 * add (or remove) shadow pages, and so readers are guaranteed to see
+	 * an empty list for their current mmu_lock critical section.
+	 */
+	smp_store_release(&kvm->arch.mmu_page_hash, h);
+	return 0;
+}
+
 static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
@@ -3901,9 +3950,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 	if (kvm_shadow_root_allocated(kvm))
 		goto out_unlock;
 
+	r = kvm_mmu_alloc_page_hash(kvm);
+	if (r)
+		goto out_unlock;
+
 	/*
-	 * Check if anything actually needs to be allocated, e.g. all metadata
-	 * will be allocated upfront if TDP is disabled.
+	 * Check if memslot metadata actually needs to be allocated, e.g. all
+	 * metadata will be allocated upfront if TDP is disabled.
 	 */
 	if (kvm_memslots_have_rmaps(kvm) &&
 	    kvm_page_track_write_tracking_enabled(kvm))
@@ -6682,15 +6735,22 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
 	kvm_tdp_mmu_zap_invalidated_roots(kvm, true);
 }
 
-void kvm_mmu_init_vm(struct kvm *kvm)
+int kvm_mmu_init_vm(struct kvm *kvm)
 {
+	int r;
+
 	kvm->arch.shadow_mmio_value = shadow_mmio_value;
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
 	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
 
-	if (tdp_mmu_enabled)
+	if (tdp_mmu_enabled) {
 		kvm_mmu_init_tdp_mmu(kvm);
+	} else {
+		r = kvm_mmu_alloc_page_hash(kvm);
+		if (r)
+			return r;
+	}
 
 	kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
 	kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
@@ -6699,6 +6759,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
 
 	kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
 	kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
+	return 0;
 }
 
 static void mmu_free_vm_memory_caches(struct kvm *kvm)
@@ -6710,6 +6771,8 @@ static void mmu_free_vm_memory_caches(struct kvm *kvm)
 
 void kvm_mmu_uninit_vm(struct kvm *kvm)
 {
+	kvfree(kvm->arch.mmu_page_hash);
+
 	if (tdp_mmu_enabled)
 		kvm_mmu_uninit_tdp_mmu(kvm);

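The lazy-publication scheme added above is a classic release/acquire pairing: zero the table, publish the pointer with a release store, and have every reader load it with acquire before walking the buckets. A minimal userspace sketch of the same pattern, using C11 atomics in place of smp_store_release()/smp_load_acquire() (names are illustrative; the writer is assumed serialized by a lock, as slots_arch_lock does in KVM):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_BUCKETS 64

struct bucket { void *head; };

static struct bucket empty_bucket;		/* cf. empty_page_hash */
static struct bucket *_Atomic page_hash;	/* published lazily, NULL until then */

/* Writer (assumed serialized by a lock): allocate, zero, then publish. */
static int alloc_page_hash(void)
{
	struct bucket *h;

	if (atomic_load_explicit(&page_hash, memory_order_relaxed))
		return 0;

	h = calloc(NUM_BUCKETS, sizeof(*h));
	if (!h)
		return -1;

	/*
	 * Release ordering makes the zeroing stores visible before the
	 * pointer itself is; cf. smp_store_release() in the hunk above.
	 */
	atomic_store_explicit(&page_hash, h, memory_order_release);
	return 0;
}

/* Reader: the acquire load pairs with the release store above. */
static struct bucket *get_bucket(unsigned int hash)
{
	struct bucket *h = atomic_load_explicit(&page_hash,
						memory_order_acquire);

	if (!h)
		return &empty_bucket;	/* table not published yet */

	return &h[hash % NUM_BUCKETS];
}

int main(void)
{
	printf("pre-publish bucket:  %p\n", (void *)get_bucket(3));
	alloc_page_hash();
	printf("post-publish bucket: %p\n", (void *)get_bucket(3));
	return 0;
}
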
arch/x86/kvm/mmu/paging_tmpl.h

Lines changed: 6 additions & 2 deletions
@@ -804,9 +804,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r != RET_PF_CONTINUE)
 		return r;
 
+#if PTTYPE != PTTYPE_EPT
 	/*
-	 * Do not change pte_access if the pfn is a mmio page, otherwise
-	 * we will cache the incorrect access into mmio spte.
+	 * Treat the guest PTE protections as writable, supervisor-only if this
+	 * is a supervisor write fault and CR0.WP=0 (supervisor accesses ignore
+	 * PTE.W if CR0.WP=0).  Don't change the access type for emulated MMIO,
+	 * otherwise KVM will cache incorrect access information in the SPTE.
 	 */
 	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
 	    !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
@@ -822,6 +825,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		if (is_cr4_smep(vcpu->arch.mmu))
 			walker.pte_access &= ~ACC_EXEC_MASK;
 	}
+#endif
 
 	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);

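Context for the new #if PTTYPE != PTTYPE_EPT guard: the quirk being fenced off is an x86 paging rule, under which supervisor-mode writes ignore PTE.W when CR0.WP=0, whereas EPT permissions have no CR0.WP interaction, so the fixup is meaningless for shadowed EPT. A toy model of that rule (hypothetical names, not KVM code):

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of the x86 rule behind the hunk above: when CR0.WP=0,
 * supervisor-mode writes succeed even if the PTE is read-only (user
 * writes always honor PTE.W).  EPT has no such CR0.WP interaction,
 * hence the #if PTTYPE != PTTYPE_EPT guard.
 */
static bool write_allowed(bool pte_writable, bool user_access, bool cr0_wp)
{
	if (user_access)
		return pte_writable;

	/* Supervisor access: PTE.W is only enforced when CR0.WP is set. */
	return pte_writable || !cr0_wp;
}

int main(void)
{
	/* Supervisor write, read-only PTE, CR0.WP=0: allowed. */
	printf("%d\n", write_allowed(false, false, false));	/* 1 */
	/* Same write with CR0.WP=1: faults. */
	printf("%d\n", write_allowed(false, false, true));	/* 0 */
	/* User write, read-only PTE: always faults. */
	printf("%d\n", write_allowed(false, true, false));	/* 0 */
	return 0;
}
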
arch/x86/kvm/svm/svm.c

Lines changed: 2 additions & 0 deletions
@@ -5494,6 +5494,8 @@ static int __init svm_init(void)
 {
 	int r;
 
+	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_svm);
+
 	__unused_size_checks();
 
 	if (!kvm_is_svm_supported())

arch/x86/kvm/vmx/main.c

Lines changed: 2 additions & 34 deletions
@@ -29,40 +29,8 @@ static __init int vt_hardware_setup(void)
 	if (ret)
 		return ret;
 
-	/*
-	 * Update vt_x86_ops::vm_size here so it is ready before
-	 * kvm_ops_update() is called in kvm_x86_vendor_init().
-	 *
-	 * Note, the actual bringing up of TDX must be done after
-	 * kvm_ops_update() because enabling TDX requires enabling
-	 * hardware virtualization first, i.e., all online CPUs must
-	 * be in post-VMXON state.  This means the @vm_size here
-	 * may be updated to TDX's size but TDX may fail to enable
-	 * at later time.
-	 *
-	 * The VMX/VT code could update kvm_x86_ops::vm_size again
-	 * after bringing up TDX, but this would require exporting
-	 * either kvm_x86_ops or kvm_ops_update() from the base KVM
-	 * module, which looks overkill.  Anyway, the worst case here
-	 * is KVM may allocate couple of more bytes than needed for
-	 * each VM.
-	 */
-	if (enable_tdx) {
-		vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
-					   sizeof(struct kvm_tdx));
-		/*
-		 * Note, TDX may fail to initialize in a later time in
-		 * vt_init(), in which case it is not necessary to setup
-		 * those callbacks.  But making them valid here even
-		 * when TDX fails to init later is fine because those
-		 * callbacks won't be called if the VM isn't TDX guest.
-		 */
-		vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
-		vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
-		vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
-		vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
-		vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
-	}
+	if (enable_tdx)
+		tdx_hardware_setup();
 
 	return 0;
 }

arch/x86/kvm/vmx/tdx.c

Lines changed: 34 additions & 13 deletions
@@ -743,7 +743,7 @@ bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
 	       !to_tdx(vcpu)->vp_enter_args.r12;
 }
 
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+static bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	u64 vcpu_state_details;
 
@@ -1638,8 +1638,8 @@ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
 	return 0;
 }
 
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
-			      enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+				     enum pg_level level, kvm_pfn_t pfn)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 	struct page *page = pfn_to_page(pfn);
@@ -1719,8 +1719,8 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
 	return 0;
 }
 
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
-			      enum pg_level level, void *private_spt)
+static int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+				     enum pg_level level, void *private_spt)
 {
 	int tdx_level = pg_level_to_tdx_sept_level(level);
 	gpa_t gpa = gfn_to_gpa(gfn);
@@ -1855,8 +1855,8 @@ static void tdx_track(struct kvm *kvm)
 	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
 }
 
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
-			      enum pg_level level, void *private_spt)
+static int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+				     enum pg_level level, void *private_spt)
 {
 	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
 
@@ -1878,8 +1878,8 @@ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
 	return tdx_reclaim_page(virt_to_page(private_spt));
 }
 
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
-				 enum pg_level level, kvm_pfn_t pfn)
+static int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+					enum pg_level level, kvm_pfn_t pfn)
 {
 	struct page *page = pfn_to_page(pfn);
 	int ret;
@@ -3603,10 +3603,14 @@ int __init tdx_bringup(void)
 	r = __tdx_bringup();
 	if (r) {
 		/*
-		 * Disable TDX only but don't fail to load module if
-		 * the TDX module could not be loaded.  No need to print
-		 * message saying "module is not loaded" because it was
-		 * printed when the first SEAMCALL failed.
+		 * Disable TDX only but don't fail to load module if the TDX
+		 * module could not be loaded.  No need to print message saying
+		 * "module is not loaded" because it was printed when the first
+		 * SEAMCALL failed.  Don't bother unwinding the S-EPT hooks or
+		 * vm_size, as kvm_x86_ops have already been finalized (and are
+		 * intentionally not exported).  The S-EPT code is unreachable,
+		 * and allocating a few more bytes per VM in a should-be-rare
+		 * failure scenario is a non-issue.
		 */
 		if (r == -ENODEV)
 			goto success_disable_tdx;
@@ -3620,3 +3624,20 @@ int __init tdx_bringup(void)
 	enable_tdx = 0;
 	return 0;
 }
+
+void __init tdx_hardware_setup(void)
+{
+	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_tdx);
+
+	/*
+	 * Note, if the TDX module can't be loaded, KVM TDX support will be
+	 * disabled but KVM will continue loading (see tdx_bringup()).
+	 */
+	vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size, sizeof(struct kvm_tdx));
+
+	vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+	vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+	vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+	vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+	vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+}

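The new tdx_hardware_setup() relies on a pattern worth spelling out: optional backend hooks are installed into the shared ops table before the backend is brought up, and since they are only ever invoked for VMs of that backend's type, leaving them in place when bringup later fails is harmless. A self-contained sketch of that shape (all names below are stand-ins, not KVM symbols):

#include <stdio.h>

struct x86_ops {
	unsigned int vm_size;
	int (*link_external_spt)(void);
};

static int tdx_link_external_spt(void)
{
	return 0;	/* the real hook would issue a SEAMCALL */
}

static struct x86_ops vt_ops = { .vm_size = 100 };	/* cf. vt_x86_ops */

static void hardware_setup(int enable_tdx, unsigned int tdx_vm_size)
{
	if (!enable_tdx)
		return;

	/* Grow vm_size to cover the larger VM type; cf. max_t() above. */
	if (vt_ops.vm_size < tdx_vm_size)
		vt_ops.vm_size = tdx_vm_size;

	vt_ops.link_external_spt = tdx_link_external_spt;
}

int main(void)
{
	int vm_is_tdx = 0;	/* bringup failed, or VM isn't a TDX guest */

	hardware_setup(1, 200);

	/* The hook is gated on the VM type and never reached otherwise. */
	if (vm_is_tdx && vt_ops.link_external_spt)
		vt_ops.link_external_spt();

	printf("vm_size = %u\n", vt_ops.vm_size);
	return 0;
}
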
arch/x86/kvm/vmx/tdx.h

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
 #ifdef CONFIG_KVM_INTEL_TDX
 #include "common.h"
 
+void tdx_hardware_setup(void);
 int tdx_bringup(void);
 void tdx_cleanup(void);

arch/x86/kvm/vmx/vmx.c

Lines changed: 2 additions & 0 deletions
@@ -8552,6 +8552,8 @@ int __init vmx_init(void)
 {
 	int r, cpu;
 
+	KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx);
+
 	if (!kvm_is_vmx_supported())
 		return -EOPNOTSUPP;

arch/x86/kvm/vmx/x86_ops.h

Lines changed: 0 additions & 10 deletions
@@ -136,7 +136,6 @@ int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
 fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags);
 void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
 void tdx_vcpu_put(struct kvm_vcpu *vcpu);
-bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int tdx_handle_exit(struct kvm_vcpu *vcpu,
 		    enum exit_fastpath_completion fastpath);
 
@@ -151,15 +150,6 @@ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
 
-int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
-			      enum pg_level level, void *private_spt);
-int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
-			      enum pg_level level, void *private_spt);
-int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
-			      enum pg_level level, kvm_pfn_t pfn);
-int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
-				 enum pg_level level, kvm_pfn_t pfn);
-
 void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
 void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
 void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);

arch/x86/kvm/x86.c

Lines changed: 4 additions & 1 deletion
@@ -12699,7 +12699,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (ret)
 		goto out;
 
-	kvm_mmu_init_vm(kvm);
+	ret = kvm_mmu_init_vm(kvm);
+	if (ret)
+		goto out_cleanup_page_track;
 
 	ret = kvm_x86_call(vm_init)(kvm);
 	if (ret)
@@ -12745,6 +12747,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 out_uninit_mmu:
 	kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
 	kvm_page_track_cleanup(kvm);
 out:
 	return ret;

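The x86.c hunk is a textbook instance of the kernel's goto-unwind idiom: now that kvm_mmu_init_vm() can fail, it needs its own label that unwinds only the steps completed before it. A self-contained toy of the same shape (stand-in functions, not KVM code):

#include <stdio.h>

static int step_page_track(void) { return 0; }
static void undo_page_track(void) { puts("page_track cleaned up"); }
static int step_mmu_init(void)    { return -1; /* simulate failure */ }
static void undo_mmu(void)        { puts("mmu uninitialized"); }
static int step_vm_init(void)     { return 0; }

/*
 * Each successfully completed step gets a dedicated unwind label; a
 * failure jumps to the label that undoes everything done so far, and
 * the labels run in reverse order of setup.
 */
static int init_vm(void)
{
	int ret;

	ret = step_page_track();
	if (ret)
		goto out;

	ret = step_mmu_init();
	if (ret)
		goto out_cleanup_page_track;	/* cf. the new label above */

	ret = step_vm_init();
	if (ret)
		goto out_uninit_mmu;

	return 0;

out_uninit_mmu:
	undo_mmu();
out_cleanup_page_track:
	undo_page_track();
out:
	return ret;
}

int main(void)
{
	printf("init_vm() = %d\n", init_vm());
	return 0;
}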