Skip to content

Commit 0bb933a

Browse files
committed
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull x86 kvm fixes from Paolo Bonzini:

 - Avoid freeing stack-allocated node in kvm_async_pf_queue_task

 - Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  selftests: kvm: Verify TILELOADD actually #NM faults when XFD[18]=1
  selftests: kvm: try getting XFD and XSAVE state out of sync
  selftests: kvm: replace numbered sync points with actions
  x86/fpu: Clear XSTATE_BV[i] in guest XSAVE state whenever XFD[i]=1
  x86/kvm: Avoid freeing stack-allocated node in kvm_async_pf_queue_task
2 parents afd12f9 + 3611ca7 commit 0bb933a

4 files changed

Lines changed: 139 additions & 65 deletions

File tree

arch/x86/kernel/fpu/core.c

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -319,10 +319,29 @@ EXPORT_SYMBOL_FOR_KVM(fpu_enable_guest_xfd_features);
319319
#ifdef CONFIG_X86_64
320320
/*
 * Update the guest's XFD value in its fpstate. Called by KVM when the guest
 * writes MSR_IA32_XFD; runs with the new @xfd value under fpregs_lock() so
 * the register/image state cannot change underneath us.
 */
void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd)
{
	struct fpstate *fpstate = guest_fpu->fpstate;

	fpregs_lock();

	/*
	 * KVM's guest ABI is that setting XFD[i]=1 *can* immediately revert the
	 * save state to its initial configuration. Likewise, KVM_GET_XSAVE does
	 * the same as XSAVE and returns XSTATE_BV[i]=0 whenever XFD[i]=1.
	 *
	 * If the guest's FPU state is in hardware, just update XFD: the XSAVE
	 * in fpu_swap_kvm_fpstate will clear XSTATE_BV[i] whenever XFD[i]=1.
	 *
	 * If however the guest's FPU state is NOT resident in hardware, clear
	 * disabled components in XSTATE_BV now, or a subsequent XRSTOR will
	 * attempt to load disabled components and generate #NM _in the host_.
	 */
	if (xfd && test_thread_flag(TIF_NEED_FPU_LOAD))
		fpstate->regs.xsave.header.xfeatures &= ~xfd;

	fpstate->xfd = xfd;
	if (fpstate->in_use)
		xfd_update_state(fpstate);

	fpregs_unlock();
}
328347
EXPORT_SYMBOL_FOR_KVM(fpu_update_guest_xfd);
@@ -430,6 +449,13 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
430449
if (ustate->xsave.header.xfeatures & ~xcr0)
431450
return -EINVAL;
432451

452+
/*
453+
* Disabled features must be in their initial state, otherwise XRSTOR
454+
* causes an exception.
455+
*/
456+
if (WARN_ON_ONCE(ustate->xsave.header.xfeatures & kstate->xfd))
457+
return -EINVAL;
458+
433459
/*
434460
* Nullify @vpkru to preserve its current value if PKRU's bit isn't set
435461
* in the header. KVM's odd ABI is to leave PKRU untouched in this

arch/x86/kernel/kvm.c

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ struct kvm_task_sleep_node {
8989
struct swait_queue_head wq;
9090
u32 token;
9191
int cpu;
92+
bool dummy;
9293
};
9394

9495
static struct kvm_task_sleep_head {
@@ -120,15 +121,26 @@ static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
120121
raw_spin_lock(&b->lock);
121122
e = _find_apf_task(b, token);
122123
if (e) {
123-
/* dummy entry exist -> wake up was delivered ahead of PF */
124-
hlist_del(&e->link);
124+
struct kvm_task_sleep_node *dummy = NULL;
125+
126+
/*
127+
* The entry can either be a 'dummy' entry (which is put on the
128+
* list when wake-up happens ahead of APF handling completion)
129+
* or a token from another task which should not be touched.
130+
*/
131+
if (e->dummy) {
132+
hlist_del(&e->link);
133+
dummy = e;
134+
}
135+
125136
raw_spin_unlock(&b->lock);
126-
kfree(e);
137+
kfree(dummy);
127138
return false;
128139
}
129140

130141
n->token = token;
131142
n->cpu = smp_processor_id();
143+
n->dummy = false;
132144
init_swait_queue_head(&n->wq);
133145
hlist_add_head(&n->link, &b->list);
134146
raw_spin_unlock(&b->lock);
@@ -231,6 +243,7 @@ static void kvm_async_pf_task_wake(u32 token)
231243
}
232244
dummy->token = token;
233245
dummy->cpu = smp_processor_id();
246+
dummy->dummy = true;
234247
init_swait_queue_head(&dummy->wq);
235248
hlist_add_head(&dummy->link, &b->list);
236249
dummy = NULL;

arch/x86/kvm/x86.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5807,9 +5807,18 @@ static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
58075807
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
58085808
struct kvm_xsave *guest_xsave)
58095809
{
5810+
union fpregs_state *xstate = (union fpregs_state *)guest_xsave->region;
5811+
58105812
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
58115813
return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
58125814

5815+
/*
5816+
* For backwards compatibility, do not expect disabled features to be in
5817+
* their initial state. XSTATE_BV[i] must still be cleared whenever
5818+
* XFD[i]=1, or XRSTOR would cause a #NM.
5819+
*/
5820+
xstate->xsave.header.xfeatures &= ~vcpu->arch.guest_fpu.fpstate->xfd;
5821+
58135822
return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
58145823
guest_xsave->region,
58155824
kvm_caps.supported_xcr0,

tools/testing/selftests/kvm/x86/amx_test.c

Lines changed: 85 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,12 @@ static inline void __tileloadd(void *tile)
6969
: : "a"(tile), "d"(0));
7070
}
7171

72+
static inline int tileloadd_safe(void *tile)
73+
{
74+
return kvm_asm_safe(".byte 0xc4,0xe2,0x7b,0x4b,0x04,0x10",
75+
"a"(tile), "d"(0));
76+
}
77+
7278
static inline void __tilerelease(void)
7379
{
7480
asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0" ::);
@@ -124,27 +130,52 @@ static void set_tilecfg(struct tile_config *cfg)
124130
}
125131
}
126132

133+
/*
 * Actions the guest asks the host to perform at a sync point, passed as a
 * bitmask via GUEST_SYNC() so one sync point can request several actions
 * (hence power-of-two values rather than sequential numbering).
 */
enum {
	/* Retrieve TMM0 from guest, stash it for TEST_RESTORE_TILEDATA */
	TEST_SAVE_TILEDATA = 1,

	/* Check TMM0 against tiledata */
	TEST_COMPARE_TILEDATA = 2,

	/* Restore TMM0 from earlier save */
	TEST_RESTORE_TILEDATA = 4,

	/* Full VM save/restore */
	TEST_SAVE_RESTORE = 8,
};
};
146+
127147
static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
128148
struct tile_data *tiledata,
129149
struct xstate *xstate)
130150
{
151+
int vector;
152+
131153
GUEST_ASSERT(this_cpu_has(X86_FEATURE_XSAVE) &&
132154
this_cpu_has(X86_FEATURE_OSXSAVE));
133155
check_xtile_info();
134-
GUEST_SYNC(1);
156+
GUEST_SYNC(TEST_SAVE_RESTORE);
135157

136158
/* xfd=0, enable amx */
137159
wrmsr(MSR_IA32_XFD, 0);
138-
GUEST_SYNC(2);
160+
GUEST_SYNC(TEST_SAVE_RESTORE);
139161
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == 0);
140162
set_tilecfg(amx_cfg);
141163
__ldtilecfg(amx_cfg);
142-
GUEST_SYNC(3);
164+
GUEST_SYNC(TEST_SAVE_RESTORE);
143165
/* Check save/restore when trap to userspace */
144166
__tileloadd(tiledata);
145-
GUEST_SYNC(4);
167+
GUEST_SYNC(TEST_SAVE_TILEDATA | TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
168+
169+
/* xfd=0x40000, disable amx tiledata */
170+
wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
171+
172+
/* host tries setting tiledata while guest XFD is set */
173+
GUEST_SYNC(TEST_RESTORE_TILEDATA);
174+
GUEST_SYNC(TEST_SAVE_RESTORE);
175+
176+
wrmsr(MSR_IA32_XFD, 0);
146177
__tilerelease();
147-
GUEST_SYNC(5);
178+
GUEST_SYNC(TEST_SAVE_RESTORE);
148179
/*
149180
* After XSAVEC, XTILEDATA is cleared in the xstate_bv but is set in
150181
* the xcomp_bv.
@@ -154,6 +185,8 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
154185
GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
155186
GUEST_ASSERT(xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA);
156187

188+
/* #NM test */
189+
157190
/* xfd=0x40000, disable amx tiledata */
158191
wrmsr(MSR_IA32_XFD, XFEATURE_MASK_XTILE_DATA);
159192

@@ -166,32 +199,33 @@ static void __attribute__((__flatten__)) guest_code(struct tile_config *amx_cfg,
166199
GUEST_ASSERT(!(xstate->header.xstate_bv & XFEATURE_MASK_XTILE_DATA));
167200
GUEST_ASSERT((xstate->header.xcomp_bv & XFEATURE_MASK_XTILE_DATA));
168201

169-
GUEST_SYNC(6);
202+
GUEST_SYNC(TEST_SAVE_RESTORE);
170203
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
171204
set_tilecfg(amx_cfg);
172205
__ldtilecfg(amx_cfg);
173-
/* Trigger #NM exception */
174-
__tileloadd(tiledata);
175-
GUEST_SYNC(10);
176206

177-
GUEST_DONE();
178-
}
207+
/* Trigger #NM exception */
208+
vector = tileloadd_safe(tiledata);
209+
__GUEST_ASSERT(vector == NM_VECTOR,
210+
"Wanted #NM on tileloadd with XFD[18]=1, got %s",
211+
ex_str(vector));
179212

180-
void guest_nm_handler(struct ex_regs *regs)
181-
{
182-
/* Check if #NM is triggered by XFEATURE_MASK_XTILE_DATA */
183-
GUEST_SYNC(7);
184213
GUEST_ASSERT(!(get_cr0() & X86_CR0_TS));
185214
GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
186215
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
187-
GUEST_SYNC(8);
216+
GUEST_SYNC(TEST_SAVE_RESTORE);
188217
GUEST_ASSERT(rdmsr(MSR_IA32_XFD_ERR) == XFEATURE_MASK_XTILE_DATA);
189218
GUEST_ASSERT(rdmsr(MSR_IA32_XFD) == XFEATURE_MASK_XTILE_DATA);
190219
/* Clear xfd_err */
191220
wrmsr(MSR_IA32_XFD_ERR, 0);
192221
/* xfd=0, enable amx */
193222
wrmsr(MSR_IA32_XFD, 0);
194-
GUEST_SYNC(9);
223+
GUEST_SYNC(TEST_SAVE_RESTORE);
224+
225+
__tileloadd(tiledata);
226+
GUEST_SYNC(TEST_COMPARE_TILEDATA | TEST_SAVE_RESTORE);
227+
228+
GUEST_DONE();
195229
}
196230

197231
int main(int argc, char *argv[])
@@ -200,10 +234,10 @@ int main(int argc, char *argv[])
200234
struct kvm_vcpu *vcpu;
201235
struct kvm_vm *vm;
202236
struct kvm_x86_state *state;
237+
struct kvm_x86_state *tile_state = NULL;
203238
int xsave_restore_size;
204239
vm_vaddr_t amx_cfg, tiledata, xstate;
205240
struct ucall uc;
206-
u32 amx_offset;
207241
int ret;
208242

209243
/*
@@ -228,9 +262,6 @@ int main(int argc, char *argv[])
228262

229263
vcpu_regs_get(vcpu, &regs1);
230264

231-
/* Register #NM handler */
232-
vm_install_exception_handler(vm, NM_VECTOR, guest_nm_handler);
233-
234265
/* amx cfg for guest_code */
235266
amx_cfg = vm_vaddr_alloc_page(vm);
236267
memset(addr_gva2hva(vm, amx_cfg), 0x0, getpagesize());
@@ -244,6 +275,7 @@ int main(int argc, char *argv[])
244275
memset(addr_gva2hva(vm, xstate), 0, PAGE_SIZE * DIV_ROUND_UP(XSAVE_SIZE, PAGE_SIZE));
245276
vcpu_args_set(vcpu, 3, amx_cfg, tiledata, xstate);
246277

278+
int iter = 0;
247279
for (;;) {
248280
vcpu_run(vcpu);
249281
TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
@@ -253,37 +285,47 @@ int main(int argc, char *argv[])
253285
REPORT_GUEST_ASSERT(uc);
254286
/* NOT REACHED */
255287
case UCALL_SYNC:
256-
switch (uc.args[1]) {
257-
case 1:
258-
case 2:
259-
case 3:
260-
case 5:
261-
case 6:
262-
case 7:
263-
case 8:
264-
fprintf(stderr, "GUEST_SYNC(%ld)\n", uc.args[1]);
265-
break;
266-
case 4:
267-
case 10:
268-
fprintf(stderr,
269-
"GUEST_SYNC(%ld), check save/restore status\n", uc.args[1]);
288+
++iter;
289+
if (uc.args[1] & TEST_SAVE_TILEDATA) {
290+
fprintf(stderr, "GUEST_SYNC #%d, save tiledata\n", iter);
291+
tile_state = vcpu_save_state(vcpu);
292+
}
293+
if (uc.args[1] & TEST_COMPARE_TILEDATA) {
294+
fprintf(stderr, "GUEST_SYNC #%d, check TMM0 contents\n", iter);
270295

271296
/* Compacted mode, get amx offset by xsave area
272297
* size subtract 8K amx size.
273298
*/
274-
amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
275-
state = vcpu_save_state(vcpu);
276-
void *amx_start = (void *)state->xsave + amx_offset;
299+
u32 amx_offset = xsave_restore_size - NUM_TILES*TILE_SIZE;
300+
void *amx_start = (void *)tile_state->xsave + amx_offset;
277301
void *tiles_data = (void *)addr_gva2hva(vm, tiledata);
278302
/* Only check TMM0 register, 1 tile */
279303
ret = memcmp(amx_start, tiles_data, TILE_SIZE);
280304
TEST_ASSERT(ret == 0, "memcmp failed, ret=%d", ret);
305+
}
306+
if (uc.args[1] & TEST_RESTORE_TILEDATA) {
307+
fprintf(stderr, "GUEST_SYNC #%d, before KVM_SET_XSAVE\n", iter);
308+
vcpu_xsave_set(vcpu, tile_state->xsave);
309+
fprintf(stderr, "GUEST_SYNC #%d, after KVM_SET_XSAVE\n", iter);
310+
}
311+
if (uc.args[1] & TEST_SAVE_RESTORE) {
312+
fprintf(stderr, "GUEST_SYNC #%d, save/restore VM state\n", iter);
313+
state = vcpu_save_state(vcpu);
314+
memset(&regs1, 0, sizeof(regs1));
315+
vcpu_regs_get(vcpu, &regs1);
316+
317+
kvm_vm_release(vm);
318+
319+
/* Restore state in a new VM. */
320+
vcpu = vm_recreate_with_one_vcpu(vm);
321+
vcpu_load_state(vcpu, state);
281322
kvm_x86_state_cleanup(state);
282-
break;
283-
case 9:
284-
fprintf(stderr,
285-
"GUEST_SYNC(%ld), #NM exception and enable amx\n", uc.args[1]);
286-
break;
323+
324+
memset(&regs2, 0, sizeof(regs2));
325+
vcpu_regs_get(vcpu, &regs2);
326+
TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
327+
"Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
328+
(ulong) regs2.rdi, (ulong) regs2.rsi);
287329
}
288330
break;
289331
case UCALL_DONE:
@@ -293,22 +335,6 @@ int main(int argc, char *argv[])
293335
TEST_FAIL("Unknown ucall %lu", uc.cmd);
294336
}
295337

296-
state = vcpu_save_state(vcpu);
297-
memset(&regs1, 0, sizeof(regs1));
298-
vcpu_regs_get(vcpu, &regs1);
299-
300-
kvm_vm_release(vm);
301-
302-
/* Restore state in a new VM. */
303-
vcpu = vm_recreate_with_one_vcpu(vm);
304-
vcpu_load_state(vcpu, state);
305-
kvm_x86_state_cleanup(state);
306-
307-
memset(&regs2, 0, sizeof(regs2));
308-
vcpu_regs_get(vcpu, &regs2);
309-
TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
310-
"Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
311-
(ulong) regs2.rdi, (ulong) regs2.rsi);
312338
}
313339
done:
314340
kvm_vm_free(vm);

0 commit comments

Comments
 (0)