Skip to content

Commit 5baa0fc

Browse files
authored
Merge pull request #154 from phcoder/w64
libco/amd64: Replace windows variant with code from higan
2 parents 97b4391 + d972593 commit 5baa0fc

1 file changed

Lines changed: 49 additions & 44 deletions

File tree

libco/amd64.c

Lines changed: 49 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -25,51 +25,56 @@ static void (*co_swap)(cothread_t, cothread_t) = 0;
2525

2626
#ifdef _WIN32
2727
/* ABI: Win64 */
28+
/* On Windows the handle is allocated by malloc, which there is guaranteed to
29+
return memory with at least 16-byte alignment. Hence we don't need to align
30+
it ourselves in order to use movaps. */
2831
static unsigned char co_swap_function[] = {
29-
0x48, 0x89, 0x22, /* mov [rdx],rsp */
30-
0x48, 0x8b, 0x21, /* mov rsp,[rcx] */
31-
0x58, /* pop rax */
32-
0x48, 0x89, 0x6a, 0x08, /* mov [rdx+0x8],rbp */
33-
0x48, 0x89, 0x72, 0x10, /* mov [rdx+0x10],rsi */
34-
0x48, 0x89, 0x7a, 0x18, /* mov [rdx+0x18],rdi */
35-
0x48, 0x89, 0x5a, 0x20, /* mov [rdx+0x20],rbx */
36-
0x4c, 0x89, 0x62, 0x28, /* mov [rdx+0x28],r12 */
37-
0x4c, 0x89, 0x6a, 0x30, /* mov [rdx+0x30],r13 */
38-
0x4c, 0x89, 0x72, 0x38, /* mov [rdx+0x38],r14 */
39-
0x4c, 0x89, 0x7a, 0x40, /* mov [rdx+0x40],r15 */
40-
0x48, 0x81, 0xc2, 0x80, 0x00, 0x00, 0x00, /* add rdx,0x80 */
41-
0x48, 0x83, 0xe2, 0xf0, /* and rdx,-0x10 */
42-
0x0f, 0x29, 0x32, /* movaps [rdx],xmm6 */
43-
0x0f, 0x29, 0x7a, 0x10, /* movaps [rdx+0x10],xmm7 */
44-
0x44, 0x0f, 0x29, 0x42, 0x20, /* movaps [rdx+0x20],xmm8 */
45-
0x44, 0x0f, 0x29, 0x4a, 0x30, /* movaps [rdx+0x30],xmm9 */
46-
0x44, 0x0f, 0x29, 0x52, 0x40, /* movaps [rdx+0x40],xmm10 */
47-
0x44, 0x0f, 0x29, 0x5a, 0x50, /* movaps [rdx+0x50],xmm11 */
48-
0x44, 0x0f, 0x29, 0x62, 0x60, /* movaps [rdx+0x60],xmm12 */
49-
0x44, 0x0f, 0x29, 0x6a, 0x70, /* movaps [rdx+0x70],xmm13 */
50-
0x44, 0x0f, 0x29, 0xb2, 0x80, 0x00, 0x00, 0x00, /* movaps [rdx+0x80],xmm14 */
51-
0x44, 0x0f, 0x29, 0xba, 0x90, 0x00, 0x00, 0x00, /* movaps [rdx+0x90],xmm15 */
52-
0x48, 0x8b, 0x69, 0x08, /* mov rbp,[rcx+0x8] */
53-
0x48, 0x8b, 0x71, 0x10, /* mov rsi,[rcx+0x10] */
54-
0x48, 0x8b, 0x79, 0x18, /* mov rdi,[rcx+0x18] */
55-
0x48, 0x8b, 0x59, 0x20, /* mov rbx,[rcx+0x20] */
56-
0x4c, 0x8b, 0x61, 0x28, /* mov r12,[rcx+0x28] */
57-
0x4c, 0x8b, 0x69, 0x30, /* mov r13,[rcx+0x30] */
58-
0x4c, 0x8b, 0x71, 0x38, /* mov r14,[rcx+0x38] */
59-
0x4c, 0x8b, 0x79, 0x40, /* mov r15,[rcx+0x40] */
60-
0x48, 0x81, 0xc1, 0x80, 0x00, 0x00, 0x00, /* add rcx,0x80 */
61-
0x48, 0x83, 0xe1, 0xf0, /* and rcx,-0x10 */
62-
0x0f, 0x29, 0x31, /* movaps [rcx],xmm6 */
63-
0x0f, 0x29, 0x79, 0x10, /* movaps [rcx+0x10],xmm7 */
64-
0x44, 0x0f, 0x29, 0x41, 0x20, /* movaps [rcx+0x20],xmm8 */
65-
0x44, 0x0f, 0x29, 0x49, 0x30, /* movaps [rcx+0x30],xmm9 */
66-
0x44, 0x0f, 0x29, 0x51, 0x40, /* movaps [rcx+0x40],xmm10 */
67-
0x44, 0x0f, 0x29, 0x59, 0x50, /* movaps [rcx+0x50],xmm11 */
68-
0x44, 0x0f, 0x29, 0x61, 0x60, /* movaps [rcx+0x60],xmm12 */
69-
0x44, 0x0f, 0x29, 0x69, 0x70, /* movaps [rcx+0x70],xmm13 */
70-
0x44, 0x0f, 0x29, 0xb1, 0x80, 0x00, 0x00, 0x00, /* movaps [rcx+0x80],xmm14 */
71-
0x44, 0x0f, 0x29, 0xb9, 0x90, 0x00, 0x00, 0x00, /* movaps [rcx+0x90],xmm15 */
72-
0xff, 0xe0, /* jmp rax */
32+
0x48, 0x89, 0x22, /* mov [rdx],rsp */
33+
0x48, 0x8b, 0x21, /* mov rsp,[rcx] */
34+
0x58, /* pop rax */
35+
0x48, 0x89, 0x6a, 0x08, /* mov [rdx+ 8],rbp */
36+
0x48, 0x89, 0x72, 0x10, /* mov [rdx+16],rsi */
37+
0x48, 0x89, 0x7a, 0x18, /* mov [rdx+24],rdi */
38+
0x48, 0x89, 0x5a, 0x20, /* mov [rdx+32],rbx */
39+
0x4c, 0x89, 0x62, 0x28, /* mov [rdx+40],r12 */
40+
0x4c, 0x89, 0x6a, 0x30, /* mov [rdx+48],r13 */
41+
0x4c, 0x89, 0x72, 0x38, /* mov [rdx+56],r14 */
42+
0x4c, 0x89, 0x7a, 0x40, /* mov [rdx+64],r15 */
43+
#if !defined(LIBCO_NO_SSE)
44+
0x0f, 0x29, 0x72, 0x50, /* movaps [rdx+ 80],xmm6 */
45+
0x0f, 0x29, 0x7a, 0x60, /* movaps [rdx+ 96],xmm7 */
46+
0x44, 0x0f, 0x29, 0x42, 0x70, /* movaps [rdx+112],xmm8 */
47+
0x48, 0x83, 0xc2, 0x70, /* add rdx,112 */
48+
0x44, 0x0f, 0x29, 0x4a, 0x10, /* movaps [rdx+ 16],xmm9 */
49+
0x44, 0x0f, 0x29, 0x52, 0x20, /* movaps [rdx+ 32],xmm10 */
50+
0x44, 0x0f, 0x29, 0x5a, 0x30, /* movaps [rdx+ 48],xmm11 */
51+
0x44, 0x0f, 0x29, 0x62, 0x40, /* movaps [rdx+ 64],xmm12 */
52+
0x44, 0x0f, 0x29, 0x6a, 0x50, /* movaps [rdx+ 80],xmm13 */
53+
0x44, 0x0f, 0x29, 0x72, 0x60, /* movaps [rdx+ 96],xmm14 */
54+
0x44, 0x0f, 0x29, 0x7a, 0x70, /* movaps [rdx+112],xmm15 */
55+
#endif
56+
0x48, 0x8b, 0x69, 0x08, /* mov rbp,[rcx+ 8] */
57+
0x48, 0x8b, 0x71, 0x10, /* mov rsi,[rcx+16] */
58+
0x48, 0x8b, 0x79, 0x18, /* mov rdi,[rcx+24] */
59+
0x48, 0x8b, 0x59, 0x20, /* mov rbx,[rcx+32] */
60+
0x4c, 0x8b, 0x61, 0x28, /* mov r12,[rcx+40] */
61+
0x4c, 0x8b, 0x69, 0x30, /* mov r13,[rcx+48] */
62+
0x4c, 0x8b, 0x71, 0x38, /* mov r14,[rcx+56] */
63+
0x4c, 0x8b, 0x79, 0x40, /* mov r15,[rcx+64] */
64+
#if !defined(LIBCO_NO_SSE)
65+
0x0f, 0x28, 0x71, 0x50, /* movaps xmm6, [rcx+ 80] */
66+
0x0f, 0x28, 0x79, 0x60, /* movaps xmm7, [rcx+ 96] */
67+
0x44, 0x0f, 0x28, 0x41, 0x70, /* movaps xmm8, [rcx+112] */
68+
0x48, 0x83, 0xc1, 0x70, /* add rcx,112 */
69+
0x44, 0x0f, 0x28, 0x49, 0x10, /* movaps xmm9, [rcx+ 16] */
70+
0x44, 0x0f, 0x28, 0x51, 0x20, /* movaps xmm10,[rcx+ 32] */
71+
0x44, 0x0f, 0x28, 0x59, 0x30, /* movaps xmm11,[rcx+ 48] */
72+
0x44, 0x0f, 0x28, 0x61, 0x40, /* movaps xmm12,[rcx+ 64] */
73+
0x44, 0x0f, 0x28, 0x69, 0x50, /* movaps xmm13,[rcx+ 80] */
74+
0x44, 0x0f, 0x28, 0x71, 0x60, /* movaps xmm14,[rcx+ 96] */
75+
0x44, 0x0f, 0x28, 0x79, 0x70, /* movaps xmm15,[rcx+112] */
76+
#endif
77+
0xff, 0xe0, /* jmp rax */
7378
};
7479

7580
#include <windows.h>

0 commit comments

Comments
 (0)