/* Pointer to a test routine: takes no arguments and returns nothing. */
6 typedef void (*testfn
)(void);
9 uint64_t q0
, q1
, q2
, q3
;
10 } __attribute__((aligned(32))) v4di
;
/*
 * Print one v4di as a labelled line "<name><n> = q3 q2 q1 q0", each word as
 * 16 hex digits, highest word first.
 *
 * The remaining visible lines copy the value into a local array v and print
 * it with %g: four values at width 16, or, when ff == 32, eight values at
 * width 8 -- highest index first in both cases.
 *
 * NOTE(review): the declarations of v and the condition guarding the
 * four-value printf are not visible in this view; presumably v is double[4]
 * under ff == 64 and float[8] under ff == 32 -- confirm.
 * NOTE(review): %016lx is used for uint64_t fields; that matches only where
 * uint64_t is unsigned long -- confirm the intended targets.
 */
34 static void dump_ymm(const char *name
, int n
, const v4di
*r
, int ff
)
36 printf("%s%d = %016lx %016lx %016lx %016lx\n",
37 name
, n
, r
->q3
, r
->q2
, r
->q1
, r
->q0
);
40 memcpy(v
, r
, sizeof(v
));
41 printf(" %16g %16g %16g %16g\n",
42 v
[3], v
[2], v
[1], v
[0]);
43 } else if (ff
== 32) {
45 memcpy(v
, r
, sizeof(v
));
46 printf(" %8g %8g %8g %8g %8g %8g %8g %8g\n",
47 v
[7], v
[6], v
[5], v
[4], v
[3], v
[2], v
[1], v
[0]);
/*
 * Dump a register state: the 16 entries of s->ymm labelled "ymm0".."ymm15",
 * then the 4 entries of s->mem0 labelled "mem0".."mem3".  dump_ymm is always
 * called with ff == 0 here.
 */
51 static void dump_regs(reg_state
*s
)
55 for (i
= 0; i
< 16; i
++) {
56 dump_ymm("ymm", i
, &s
->ymm
[i
], 0);
58 for (i
= 0; i
< 4; i
++) {
59 dump_ymm("mem", i
, &s
->mem0
[i
], 0);
/*
 * Report every piece of state in b that differs from a.  Only the differing
 * items are printed, and the value shown is the one from b (or, for the
 * memory slots, from a->mem -- see below).
 *
 * Compared in order: 8 mm[] entries, 16 r[] entries, 16 ymm[] entries
 * (32 bytes each, via memcmp), 4 memory slots, then flags.  ymm and memory
 * values are printed through dump_ymm using a->ff as the format selector.
 */
63 static void compare_state(const reg_state
*a
, const reg_state
*b
)
66 for (i
= 0; i
< 8; i
++) {
67 if (a
->mm
[i
] != b
->mm
[i
]) {
68 printf("MM%d = %016lx\n", i
, b
->mm
[i
]);
71 for (i
= 0; i
< 16; i
++) {
72 if (a
->r
[i
] != b
->r
[i
]) {
73 printf("r%d = %016lx\n", i
, b
->r
[i
]);
76 for (i
= 0; i
< 16; i
++) {
77 if (memcmp(&a
->ymm
[i
], &b
->ymm
[i
], 32)) {
78 dump_ymm("ymm", i
, &b
->ymm
[i
], a
->ff
);
/*
 * Memory slots are compared within a itself: a->mem0 against a->mem.  b is
 * not consulted.  run_test copies init->mem0 over init->mem before each test
 * and passes init as a, so a difference here means a->mem was written after
 * that copy.
 */
81 for (i
= 0; i
< 4; i
++) {
82 if (memcmp(&a
->mem0
[i
], &a
->mem
[i
], 32)) {
83 dump_ymm("mem", i
, &a
->mem
[i
], a
->ff
);
86 if (a
->flags
!= b
->flags
) {
87 printf("FLAGS = %016lx\n", b
->flags
);
/*
 * String fragments for building inline asm.  r is a register name and o an
 * offset; both are stringified into the instruction text.
 *   LOADMM / LOADYMM   emit "movq|vmovdqa r, o[%0]"
 *   STOREMM / STOREYMM emit "movq|vmovdqa o[%1], r"
 * In run_test, asm operand %0 is init and %1 is &result.
 * NOTE(review): the operand order looks like Intel syntax (destination
 * first) -- confirm against the build flags.
 */
91 #define LOADMM(r, o) "movq " #r ", " #o "[%0]\n\t"
92 #define LOADYMM(r, o) "vmovdqa " #r ", " #o "[%0]\n\t"
93 #define STOREMM(r, o) "movq " #o "[%1], " #r "\n\t"
94 #define STOREYMM(r, o) "vmovdqa " #o "[%1], " #r "\n\t"
/*
 * Asm string fragments for "mov" between register r and offset o from rax:
 * LOADREG emits "mov r, o[rax]", STOREREG emits "mov o[rax], r".  Unlike the
 * MM/YMM helpers, these address through rax and not through an asm operand.
 */
121 #define LOADREG(r, o) "mov " #r ", " #o "[rax]\n\t"
122 #define STOREREG(r, o) "mov " #o "[rax], " #r "\n\t"
/*
 * Run a single test described by t.
 *
 * Visible steps: take init = t->init, reset init->mem from init->mem0, print
 * the test number and name ("%5d %s"), execute an inline-asm harness whose
 * input operands are init (%0), &result (%1) and t->fn (%2), then call
 * compare_state(init, &result) to print whatever differs.
 *
 * NOTE(review): most of the asm body and the declaration of result are not
 * visible in this view.  The register-name strings after the operands appear
 * to be the asm clobber list -- confirm.
 */
138 static void run_test(const TestDef *t)
141 reg_state
*init
= t
->init
;
142 memcpy(init
->mem
, init
->mem0
, sizeof(init
->mem
));
143 printf("%5d %s\n", t
->n
, t
->s
);
159 "mov rcx, 0x2c0[rax]\n\t"
165 "mov rax, 0x240[rax]\n\t"
168 "mov rax, 8[rsp]\n\t"
171 "mov 0x240[rax], rbx\n\t"
173 "mov 0x270[rax], rbx\n\t"
174 "mov 0x278[rax], rbx\n\t"
178 "mov 0x2c0[rax], rbx\n\t"
187 : : "r"(init
), "r"(&result
), "r"(t
->fn
)
190 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
191 "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
192 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5",
193 "ymm6", "ymm7", "ymm8", "ymm9", "ymm10", "ymm11",
194 "ymm12", "ymm13", "ymm14", "ymm15"
196 compare_state(init
, &result
);
199 #define TEST(n, cmd, type) \
200 static void __attribute__((naked)) test_##n(void) \
203 asm volatile("ret"); \
205 #include "test-avx.h"
208 static const TestDef test_table
[] = {
209 #define TEST(n, cmd, type) {n, test_##n, cmd, &init##type},
210 #include "test-avx.h"
214 static void run_all(void)
217 for (t
= test_table
; t
->fn
; t
++) {
/*
 * Number of elements in the array x.  x must be a real array, not a pointer
 * or an array-typed function parameter (those decay and give a wrong count).
 * The argument is parenthesized before indexing so that any array-valued
 * expression expands with the intended precedence.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Sample floating-point values: nine for single precision, eight for double
 * precision.  init_f32reg and init_f64reg compare an index against
 * ARRAY_LEN() of these tables.
 * NOTE(review): presumably the tables are cycled through to fill registers;
 * the lines that read them are not visible here -- confirm.
 */
224 float val_f32
[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6
, 7.5, 8.3};
225 double val_f64
[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6
, 7.5};
227 {0x3d6b3b6a9e4118f2lu
, 0x355ae76d2774d78clu
,
228 0xac3ff76c4daa4b28lu
, 0xe7fabd204cb54083lu
},
229 {0xd851c54a56bf1f29lu
, 0x4a84d1d50bf4c4fflu
,
230 0x56621e553d52b56clu
, 0xd0069553da8f584alu
},
231 {0x5826475e2c5fd799lu
, 0xfd32edc01243f5e9lu
,
232 0x738ba2c66d3fe126lu
, 0x5707219c6e6c26b4lu
},
/*
 * Filler pattern: all four 64-bit words are 0xa5a5a5a5deadbeef.  init_all
 * stores it into every ymm[] and mem0[] slot of a reg_state.
 */
235 v4di deadbeef
= {0xa5a5a5a5deadbeefull
, 0xa5a5a5a5deadbeefull
,
236 0xa5a5a5a5deadbeefull
, 0xa5a5a5a5deadbeefull
};
/*
 * NOTE(review): indexq and indexd are not referenced in the visible code;
 * the names suggest index vectors with 64-bit and 32-bit elements for
 * gather-style tests -- confirm against their users.
 */
237 v4di indexq
= {0x000000000000001full
, 0x000000000000008full
,
238 0xffffffffffffffffull
, 0xffffffffffffff5full
};
239 v4di indexd
= {0x00000002000000efull
, 0xfffffff500000010ull
,
240 0x0000000afffffff0ull
, 0x000000000000000eull
};
/*
 * 0x20 (32) v4di slots.  main fills every slot with init_intreg, and
 * init_all stores the address of the middle slot in s->r[4].
 */
242 v4di gather_mem
[0x20];
/*
 * Fill *r with single-precision test data: the visible code loops over 8
 * positions, compares a counter n against ARRAY_LEN(val_f32), and finally
 * copies sizeof(*r) bytes from a local buffer v into *r.
 * NOTE(review): the declarations of n and v and the loop body are not
 * visible; presumably n persists across calls and wraps to 0 at the end of
 * val_f32 -- confirm.
 */
244 void init_f32reg(v4di
*r
)
249 for (i
= 0; i
< 8; i
++) {
251 if (n
== ARRAY_LEN(val_f32
)) {
255 memcpy(r
, v
, sizeof(*r
));
/*
 * Fill *r with double-precision test data: the visible code loops over 4
 * positions, compares a counter n against ARRAY_LEN(val_f64), and finally
 * copies sizeof(*r) bytes from a local buffer v into *r.
 * NOTE(review): the declarations of n and v and the loop body are not
 * visible; presumably n persists across calls and wraps to 0 at the end of
 * val_f64 -- confirm.
 */
258 void init_f64reg(v4di
*r
)
263 for (i
= 0; i
< 4; i
++) {
265 if (n
== ARRAY_LEN(val_f64
)) {
269 memcpy(r
, v
, sizeof(*r
));
/*
 * Fill *r with integer test data: each of q0..q3 is the matching word of
 * val_i64[n] XORed with mask.  mask is a function-local static, so its value
 * carries over between calls.
 * NOTE(review): the declaration of n, its update, and what happens when n
 * reaches ARRAY_LEN(val_i64) are not visible; presumably n wraps to 0 and
 * mask changes so later passes yield different values -- confirm.
 */
272 void init_intreg(v4di
*r
)
274 static uint64_t mask
;
277 r
->q0
= val_i64
[n
].q0
^ mask
;
278 r
->q1
= val_i64
[n
].q1
^ mask
;
279 r
->q2
= val_i64
[n
].q2
^ mask
;
280 r
->q3
= val_i64
[n
].q3
^ mask
;
282 if (n
== ARRAY_LEN(val_i64
)) {
/*
 * Set up the parts of a register state shared by all test families.
 *
 * Three entries of s->r receive addresses (the inline comments name the
 * register each one corresponds to):
 *   r[3] -> &s->mem[0]
 *   r[4] -> the middle element of gather_mem
 *   r[5] -> &s->mem[2]
 * Every ymm[] slot (16) and every mem0[] slot (4) is then set to deadbeef.
 * NOTE(review): other initialisation may exist on lines not visible here.
 */
288 static void init_all(reg_state
*s
)
292 s
->r
[3] = (uint64_t)&s
->mem
[0]; /* rdx */
293 s
->r
[4] = (uint64_t)&gather_mem
[ARRAY_LEN(gather_mem
) / 2]; /* rsi */
294 s
->r
[5] = (uint64_t)&s
->mem
[2]; /* rdi */
296 for (i
= 0; i
< 16; i
++) {
297 s
->ymm
[i
] = deadbeef
;
301 for (i
= 0; i
< 4; i
++) {
302 s
->mem0
[i
] = deadbeef
;
/*
 * Program entry point.
 *
 * Prepares three initial states: initI (integer data), initF32 (float data)
 * and initF64 (double data).  In each, ymm[10], ymm[11], ymm[12] and mem0[1]
 * are filled by the matching init_*reg helper.  Every slot of gather_mem is
 * then filled with init_intreg.  Finally the test whose table index is given
 * by argv[1] is run.
 * NOTE(review): lines between the visible ones are missing; presumably
 * init_all is called on each state first and all tests are run when no
 * argument is given -- confirm.
 */
306 int main(int argc
, char *argv
[])
311 init_intreg(&initI
.ymm
[10]);
312 init_intreg(&initI
.ymm
[11]);
313 init_intreg(&initI
.ymm
[12]);
314 init_intreg(&initI
.mem0
[1]);
319 init_f32reg(&initF32
.ymm
[10]);
320 init_f32reg(&initF32
.ymm
[11]);
321 init_f32reg(&initF32
.ymm
[12]);
322 init_f32reg(&initF32
.mem0
[1]);
328 init_f64reg(&initF64
.ymm
[10]);
329 init_f64reg(&initF64
.ymm
[11]);
330 init_f64reg(&initF64
.ymm
[12]);
331 init_f64reg(&initF64
.mem0
[1]);
336 for (i
= 0; i
< ARRAY_LEN(gather_mem
); i
++) {
337 init_intreg(&gather_mem
[i
]);
/*
 * NOTE(review): n comes straight from atoi(argv[1]) and indexes test_table
 * with no range check, so a negative, too-large or non-numeric argument
 * reads outside the table.  atoi also cannot report a parse error; consider
 * strtol plus a bounds check.
 */
341 int n
= atoi(argv
[1]);
342 run_test(&test_table
[n
]);