Skip to content

Commit d972593

Browse files
committed
libco/amd64: Replace Windows variant with code from higan
The current code stores the xmm registers twice (store and store) instead of storing and then restoring them. We could fix our own code, but it's better to resync it with higan instead.
1 parent 6a5818f commit d972593

1 file changed

Lines changed: 49 additions & 44 deletions

File tree

libco/amd64.c

Lines changed: 49 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -25,51 +25,56 @@ static void (*co_swap)(cothread_t, cothread_t) = 0;
2525

2626
#ifdef _WIN32
2727
/* ABI: Win64 */
28+
/* On Windows the handle is allocated by malloc, whose result is guaranteed to
29+
have at least 16-byte alignment. Hence we don't need to align
30+
it in order to use movaps. */
2831
static unsigned char co_swap_function[] = {
29-
0x48, 0x89, 0x22, /* mov [rdx],rsp */
30-
0x48, 0x8b, 0x21, /* mov rsp,[rcx] */
31-
0x58, /* pop rax */
32-
0x48, 0x89, 0x6a, 0x08, /* mov [rdx+0x8],rbp */
33-
0x48, 0x89, 0x72, 0x10, /* mov [rdx+0x10],rsi */
34-
0x48, 0x89, 0x7a, 0x18, /* mov [rdx+0x18],rdi */
35-
0x48, 0x89, 0x5a, 0x20, /* mov [rdx+0x20],rbx */
36-
0x4c, 0x89, 0x62, 0x28, /* mov [rdx+0x28],r12 */
37-
0x4c, 0x89, 0x6a, 0x30, /* mov [rdx+0x30],r13 */
38-
0x4c, 0x89, 0x72, 0x38, /* mov [rdx+0x38],r14 */
39-
0x4c, 0x89, 0x7a, 0x40, /* mov [rdx+0x40],r15 */
40-
0x48, 0x81, 0xc2, 0x80, 0x00, 0x00, 0x00, /* add rdx,0x80 */
41-
0x48, 0x83, 0xe2, 0xf0, /* and rdx,-0x10 */
42-
0x0f, 0x29, 0x32, /* movaps [rdx],xmm6 */
43-
0x0f, 0x29, 0x7a, 0x10, /* movaps [rdx+0x10],xmm7 */
44-
0x44, 0x0f, 0x29, 0x42, 0x20, /* movaps [rdx+0x20],xmm8 */
45-
0x44, 0x0f, 0x29, 0x4a, 0x30, /* movaps [rdx+0x30],xmm9 */
46-
0x44, 0x0f, 0x29, 0x52, 0x40, /* movaps [rdx+0x40],xmm10 */
47-
0x44, 0x0f, 0x29, 0x5a, 0x50, /* movaps [rdx+0x50],xmm11 */
48-
0x44, 0x0f, 0x29, 0x62, 0x60, /* movaps [rdx+0x60],xmm12 */
49-
0x44, 0x0f, 0x29, 0x6a, 0x70, /* movaps [rdx+0x70],xmm13 */
50-
0x44, 0x0f, 0x29, 0xb2, 0x80, 0x00, 0x00, 0x00, /* movaps [rdx+0x80],xmm14 */
51-
0x44, 0x0f, 0x29, 0xba, 0x90, 0x00, 0x00, 0x00, /* movaps [rdx+0x90],xmm15 */
52-
0x48, 0x8b, 0x69, 0x08, /* mov rbp,[rcx+0x8] */
53-
0x48, 0x8b, 0x71, 0x10, /* mov rsi,[rcx+0x10] */
54-
0x48, 0x8b, 0x79, 0x18, /* mov rdi,[rcx+0x18] */
55-
0x48, 0x8b, 0x59, 0x20, /* mov rbx,[rcx+0x20] */
56-
0x4c, 0x8b, 0x61, 0x28, /* mov r12,[rcx+0x28] */
57-
0x4c, 0x8b, 0x69, 0x30, /* mov r13,[rcx+0x30] */
58-
0x4c, 0x8b, 0x71, 0x38, /* mov r14,[rcx+0x38] */
59-
0x4c, 0x8b, 0x79, 0x40, /* mov r15,[rcx+0x40] */
60-
0x48, 0x81, 0xc1, 0x80, 0x00, 0x00, 0x00, /* add rcx,0x80 */
61-
0x48, 0x83, 0xe1, 0xf0, /* and rcx,-0x10 */
62-
0x0f, 0x29, 0x31, /* movaps [rcx],xmm6 */
63-
0x0f, 0x29, 0x79, 0x10, /* movaps [rcx+0x10],xmm7 */
64-
0x44, 0x0f, 0x29, 0x41, 0x20, /* movaps [rcx+0x20],xmm8 */
65-
0x44, 0x0f, 0x29, 0x49, 0x30, /* movaps [rcx+0x30],xmm9 */
66-
0x44, 0x0f, 0x29, 0x51, 0x40, /* movaps [rcx+0x40],xmm10 */
67-
0x44, 0x0f, 0x29, 0x59, 0x50, /* movaps [rcx+0x50],xmm11 */
68-
0x44, 0x0f, 0x29, 0x61, 0x60, /* movaps [rcx+0x60],xmm12 */
69-
0x44, 0x0f, 0x29, 0x69, 0x70, /* movaps [rcx+0x70],xmm13 */
70-
0x44, 0x0f, 0x29, 0xb1, 0x80, 0x00, 0x00, 0x00, /* movaps [rcx+0x80],xmm14 */
71-
0x44, 0x0f, 0x29, 0xb9, 0x90, 0x00, 0x00, 0x00, /* movaps [rcx+0x90],xmm15 */
72-
0xff, 0xe0, /* jmp rax */
32+
0x48, 0x89, 0x22, /* mov [rdx],rsp */
33+
0x48, 0x8b, 0x21, /* mov rsp,[rcx] */
34+
0x58, /* pop rax */
35+
0x48, 0x89, 0x6a, 0x08, /* mov [rdx+ 8],rbp */
36+
0x48, 0x89, 0x72, 0x10, /* mov [rdx+16],rsi */
37+
0x48, 0x89, 0x7a, 0x18, /* mov [rdx+24],rdi */
38+
0x48, 0x89, 0x5a, 0x20, /* mov [rdx+32],rbx */
39+
0x4c, 0x89, 0x62, 0x28, /* mov [rdx+40],r12 */
40+
0x4c, 0x89, 0x6a, 0x30, /* mov [rdx+48],r13 */
41+
0x4c, 0x89, 0x72, 0x38, /* mov [rdx+56],r14 */
42+
0x4c, 0x89, 0x7a, 0x40, /* mov [rdx+64],r15 */
43+
#if !defined(LIBCO_NO_SSE)
44+
0x0f, 0x29, 0x72, 0x50, /* movaps [rdx+ 80],xmm6 */
45+
0x0f, 0x29, 0x7a, 0x60, /* movaps [rdx+ 96],xmm7 */
46+
0x44, 0x0f, 0x29, 0x42, 0x70, /* movaps [rdx+112],xmm8 */
47+
0x48, 0x83, 0xc2, 0x70, /* add rdx,112 */
48+
0x44, 0x0f, 0x29, 0x4a, 0x10, /* movaps [rdx+ 16],xmm9 */
49+
0x44, 0x0f, 0x29, 0x52, 0x20, /* movaps [rdx+ 32],xmm10 */
50+
0x44, 0x0f, 0x29, 0x5a, 0x30, /* movaps [rdx+ 48],xmm11 */
51+
0x44, 0x0f, 0x29, 0x62, 0x40, /* movaps [rdx+ 64],xmm12 */
52+
0x44, 0x0f, 0x29, 0x6a, 0x50, /* movaps [rdx+ 80],xmm13 */
53+
0x44, 0x0f, 0x29, 0x72, 0x60, /* movaps [rdx+ 96],xmm14 */
54+
0x44, 0x0f, 0x29, 0x7a, 0x70, /* movaps [rdx+112],xmm15 */
55+
#endif
56+
0x48, 0x8b, 0x69, 0x08, /* mov rbp,[rcx+ 8] */
57+
0x48, 0x8b, 0x71, 0x10, /* mov rsi,[rcx+16] */
58+
0x48, 0x8b, 0x79, 0x18, /* mov rdi,[rcx+24] */
59+
0x48, 0x8b, 0x59, 0x20, /* mov rbx,[rcx+32] */
60+
0x4c, 0x8b, 0x61, 0x28, /* mov r12,[rcx+40] */
61+
0x4c, 0x8b, 0x69, 0x30, /* mov r13,[rcx+48] */
62+
0x4c, 0x8b, 0x71, 0x38, /* mov r14,[rcx+56] */
63+
0x4c, 0x8b, 0x79, 0x40, /* mov r15,[rcx+64] */
64+
#if !defined(LIBCO_NO_SSE)
65+
0x0f, 0x28, 0x71, 0x50, /* movaps xmm6, [rcx+ 80] */
66+
0x0f, 0x28, 0x79, 0x60, /* movaps xmm7, [rcx+ 96] */
67+
0x44, 0x0f, 0x28, 0x41, 0x70, /* movaps xmm8, [rcx+112] */
68+
0x48, 0x83, 0xc1, 0x70, /* add rcx,112 */
69+
0x44, 0x0f, 0x28, 0x49, 0x10, /* movaps xmm9, [rcx+ 16] */
70+
0x44, 0x0f, 0x28, 0x51, 0x20, /* movaps xmm10,[rcx+ 32] */
71+
0x44, 0x0f, 0x28, 0x59, 0x30, /* movaps xmm11,[rcx+ 48] */
72+
0x44, 0x0f, 0x28, 0x61, 0x40, /* movaps xmm12,[rcx+ 64] */
73+
0x44, 0x0f, 0x28, 0x69, 0x50, /* movaps xmm13,[rcx+ 80] */
74+
0x44, 0x0f, 0x28, 0x71, 0x60, /* movaps xmm14,[rcx+ 96] */
75+
0x44, 0x0f, 0x28, 0x79, 0x70, /* movaps xmm15,[rcx+112] */
76+
#endif
77+
0xff, 0xe0, /* jmp rax */
7378
};
7479

7580
#include <windows.h>

0 commit comments

Comments
 (0)