Fix Polly
[polly-mirror.git] / docs / experiments / matmul / matmul.polly.interchanged+tiled+vector.s
blob 194fdb144c9251f0f2bf8fd0e12dc306e7c54304
1 .text
2 .file "matmul.c"
# ---------------------------------------------------------------------
# void init_array(void)              x86-64, AT&T syntax, System V AMD64
#
# Fills the global float matrices A and B with identical contents.
# Rows are 6144 bytes (1536 floats) apart and 1536 rows are written.
# For row i and column j the stored value is
#       (float)(0.5 * (double)(((i * j) & 1023) + 1))
# Two adjacent columns (even j, then j+1) are produced per inner pass.
#
# Register roles:
#   %rcx   address of the current row of A
#   %rax   address of the current row of B
#   %r9    row index i
#   %r8d   2*i, added to the running product after each column pair
#   %rdi   odd column index of the pair being written (1, 3, ..., 1535)
#   %edx   running product i*j for the even column j = %rdi - 1
#   %xmm0  the constant 0.5
# Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0, %xmm1, flags
# ---------------------------------------------------------------------
        .section        .rodata.cst8,"aM",@progbits,8
        .p2align        3
.Linit_half:
        .quad   0x3FE0000000000000              # IEEE-754 double 0.5
        .text
        .globl  init_array
        .p2align        4, 0x90
        .type   init_array,@function
init_array:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp

        movsd   .Linit_half(%rip), %xmm0        # xmm0 = 0.5
        leaq    A(%rip), %rcx
        leaq    B(%rip), %rax
        xorl    %r9d, %r9d                      # i = 0
        xorl    %r8d, %r8d                      # 2*i = 0

        .p2align        4, 0x90
.Linit_row:                                     # once per row i
        xorl    %edx, %edx                      # i*j = 0 for j = 0
        movl    $1, %edi                        # first odd column index

        .p2align        4, 0x90
.Linit_pair:                                    # once per column pair
        # Even column j = %rdi - 1.  i*j is even here, so
        # (i*j & 0x3FE) | 1 equals (i*j & 0x3FF) + 1.
        movl    %edx, %esi
        andl    $0x3FE, %esi
        orl     $1, %esi
        xorps   %xmm1, %xmm1                    # clear before the scalar convert
        cvtsi2sdl       %esi, %xmm1
        mulsd   %xmm0, %xmm1
        cvtsd2ss        %xmm1, %xmm1
        movss   %xmm1, -4(%rax,%rdi,4)          # B[i][j]
        movss   %xmm1, -4(%rcx,%rdi,4)          # A[i][j]

        # Odd column j+1: i*(j+1) = i*j + i.
        leal    (%r9,%rdx), %esi
        andl    $0x3FF, %esi
        addl    $1, %esi
        xorps   %xmm1, %xmm1
        cvtsi2sdl       %esi, %xmm1
        mulsd   %xmm0, %xmm1
        cvtsd2ss        %xmm1, %xmm1
        movss   %xmm1, (%rax,%rdi,4)            # B[i][j+1]
        movss   %xmm1, (%rcx,%rdi,4)            # A[i][j+1]

        addl    %r8d, %edx                      # i*j advances by 2*i
        addq    $2, %rdi
        cmpq    $1537, %rdi                     # stop after column 1535
        jne     .Linit_pair

        addl    $2, %r8d
        addq    $0x1800, %rcx                   # next row of A (6144 bytes)
        addq    $0x1800, %rax                   # next row of B (6144 bytes)
        addq    $1, %r9
        cmpq    $1536, %r9
        jne     .Linit_row

        popq    %rbp
        .cfi_def_cfa %rsp, 8
        retq
.Linit_array_end:
        .size   init_array, .Linit_array_end-init_array
        .cfi_endproc
                                        # -- End function
# ---------------------------------------------------------------------
# void print_array(void)             x86-64, AT&T syntax, System V AMD64
#
# Writes every element of the global float matrix C (1536 rows of 1536
# floats, rows 6144 bytes apart) to stdout: each element is widened to
# double and passed to fprintf with the format string at .L.str.
# A '\n' is written with fputc after every column j with j % 80 == 79,
# and once more after the last column of each row.
#
# Register roles (callee-saved, so they survive the libc calls):
#   %r13   address of the current row of C
#   %rbx   column index j
#   %r12   0xCCCCCCCD, reciprocal constant used to compute j / 80
#   %r14   address of the format string
#   %r15d  80 * (j / 80) + 79, compared against j after the fprintf
#   -48(%rbp)  row counter, spilled across the calls
# ---------------------------------------------------------------------
        .globl  print_array
        .p2align        4, 0x90
        .type   print_array,@function
print_array:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset %rbp, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp
        pushq   %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        pushq   %rbx
        pushq   %rax                            # reserves the slot at -48(%rbp)
        .cfi_offset %rbx, -56
        .cfi_offset %r12, -48
        .cfi_offset %r13, -40
        .cfi_offset %r14, -32
        .cfi_offset %r15, -24

        leaq    .L.str(%rip), %r14
        movl    $0xCCCCCCCD, %r12d
        leaq    C(%rip), %r13
        xorl    %eax, %eax                      # row counter = 0

        .p2align        4, 0x90
.Lprint_row:                                    # once per row
        movq    %rax, -48(%rbp)                 # spill row counter
        xorl    %ebx, %ebx                      # j = 0
        movq    stdout(%rip), %rsi

        .p2align        4, 0x90
.Lprint_elem:                                   # once per element; %rsi = stdout
        movss   (%r13,%rbx,4), %xmm0            # xmm0 = row[j]
        cvtss2sd        %xmm0, %xmm0            # widen to double for the variadic call

        movl    %ebx, %eax                      # r15d = 80*(j/80) + 79
        imulq   %r12, %rax
        shrq    $38, %rax                       # rax = j / 80
        leal    (%rax,%rax,4), %r15d
        shll    $4, %r15d
        addl    $79, %r15d

        movq    %rsi, %rdi                      # arg0 = stdout
        movq    %r14, %rsi                      # arg1 = format string
        movb    $1, %al                         # one vector register carries an argument
        callq   fprintf

        cmpl    %ebx, %r15d                     # j % 80 == 79 ?
        jne     .Lprint_next
        movl    $10, %edi                       # '\n'
        movq    stdout(%rip), %rsi
        callq   fputc@PLT

.Lprint_next:
        addq    $1, %rbx
        movq    stdout(%rip), %rsi              # reload stdout for the next call
        cmpq    $1536, %rbx
        jne     .Lprint_elem

        movl    $10, %edi                       # '\n' at the end of the row
        callq   fputc@PLT

        movq    -48(%rbp), %rax                 # reload row counter
        addq    $0x1800, %r13                   # next row of C (6144 bytes)
        addq    $1, %rax
        cmpq    $1536, %rax
        jne     .Lprint_row

        addq    $8, %rsp                        # drop the spill slot
        popq    %rbx
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        popq    %rbp
        .cfi_def_cfa %rsp, 8
        retq
.Lprint_array_end:
        .size   print_array, .Lprint_array_end-print_array
        .cfi_endproc
                                        # -- End function
# ---------------------------------------------------------------------
# main
# Calls init_array, zero-fills C (memset of 9437184 bytes), then runs a
# five-deep loop nest that accumulates C += A * B on 1536x1536 float
# matrices whose rows are 6144 bytes apart.  Returns 0 in %eax.
#
# Stack slots that carry loop state (all relative to %rbp):
#   -48   depth-1 counter: 0, 64, ... (< 1536); also the first row for %r14
#   -80   exclusive end row for %r14 (starts at 64, +64 per depth-1 step)
#   -72   A pointer for the current depth-1 step (+393216 bytes per step)
#   -168 / -176 / -184   depth-2 counter, tile index, and %r9 (B pointer)
#   -192 / -200          depth-3 counter and %r12 (A pointer)
# The remaining slots (-64 ... -160 and -224 ... -304) hold 16-byte
# vector spills.
# ---------------------------------------------------------------------
157 .globl main # -- Begin function main
158 .p2align 4, 0x90
159 .type main,@function
160 main: # @main
161 .cfi_startproc
162 # %bb.0: # %entry
163 pushq %rbp
164 .cfi_def_cfa_offset 16
165 .cfi_offset %rbp, -16
166 movq %rsp, %rbp
167 .cfi_def_cfa_register %rbp
168 pushq %r15
169 pushq %r14
170 pushq %r13
171 pushq %r12
172 pushq %rbx
173 subq $264, %rsp # imm = 0x108
174 .cfi_offset %rbx, -56
175 .cfi_offset %r12, -48
176 .cfi_offset %r13, -40
177 .cfi_offset %r14, -32
178 .cfi_offset %r15, -24
179 callq init_array
# memset(C, 0, 9437184); the zero in %rax also initialises the depth-1
# counter at -48(%rbp).
180 leaq C(%rip), %rdi
181 xorl %eax, %eax
182 movq %rax, -48(%rbp) # 8-byte Spill
183 xorl %esi, %esi
184 movl $9437184, %edx # imm = 0x900000
185 callq memset@PLT
# Initial row bound (64) and initial A pointer for the loop nest.
186 movl $64, %eax
187 movq %rax, -80(%rbp) # 8-byte Spill
188 leaq A(%rip), %rax
189 movq %rax, -72(%rbp) # 8-byte Spill
190 .p2align 4, 0x90
191 .LBB2_1: # %polly.loop_header8
192 # =>This Loop Header: Depth=1
193 # Child Loop BB2_2 Depth 2
194 # Child Loop BB2_3 Depth 3
195 # Child Loop BB2_4 Depth 4
196 # Child Loop BB2_5 Depth 5
# Depth 1: one band of 64 rows.  %r9 starts 192 bytes into B so that the
# innermost loop can address a 64-float span as -192(%rbx) ... 48(%rbx).
# %rdi = tile index and %rax = counter for the depth-2 loop.
197 leaq B+192(%rip), %r9
198 xorl %edi, %edi
199 xorl %eax, %eax
200 .p2align 4, 0x90
201 .LBB2_2: # %polly.loop_header14
202 # Parent Loop BB2_1 Depth=1
203 # => This Loop Header: Depth=2
204 # Child Loop BB2_3 Depth 3
205 # Child Loop BB2_4 Depth 4
206 # Child Loop BB2_5 Depth 5
# Depth 2: one 64-float column tile.  After the shift %rdi is the tile's
# first column (tile index * 64); %rdx, %rsi, %rcx are that column plus
# 16, 32 and 48.  %r12 restarts from the A pointer saved at -72(%rbp).
207 movq %rax, -168(%rbp) # 8-byte Spill
208 movq %rdi, -176(%rbp) # 8-byte Spill
209 shlq $6, %rdi
210 leaq 16(%rdi), %rdx
211 leaq 32(%rdi), %rsi
212 leaq 48(%rdi), %rcx
213 movq -72(%rbp), %r12 # 8-byte Reload
214 movq %r9, -184(%rbp) # 8-byte Spill
215 xorl %eax, %eax
216 .p2align 4, 0x90
217 .LBB2_3: # %polly.loop_header20
218 # Parent Loop BB2_1 Depth=1
219 # Parent Loop BB2_2 Depth=2
220 # => This Loop Header: Depth=3
221 # Child Loop BB2_4 Depth 4
222 # Child Loop BB2_5 Depth 5
# Depth 3: one block of 64 values along the reduction dimension.
# %r14 restarts at the first row of the current band.
223 movq %rax, -192(%rbp) # 8-byte Spill
224 movq %r12, -200(%rbp) # 8-byte Spill
225 movq -48(%rbp), %r14 # 8-byte Reload
226 .p2align 4, 0x90
227 .LBB2_4: # %polly.loop_header26
228 # Parent Loop BB2_1 Depth=1
229 # Parent Loop BB2_2 Depth=2
230 # Parent Loop BB2_3 Depth=3
231 # => This Loop Header: Depth=4
232 # Child Loop BB2_5 Depth 5
# Depth 4: one row.  %rbx = C + %r14 * 6144 (3 * %r14, shifted left by 11).
# %r8, %r15, %r10, %r11 keep the addresses of the four 16-float groups of
# the tile for the stores after the inner loop.
233 leaq (%r14,%r14,2), %rbx
234 shlq $11, %rbx
235 leaq C(%rip), %rax
236 addq %rax, %rbx
237 leaq (%rbx,%rdi,4), %r8
238 leaq (%rbx,%rdx,4), %r15
239 leaq (%rbx,%rsi,4), %r10
240 leaq (%rbx,%rcx,4), %r11
# Load the 64 floats of this C row/tile as 16 vectors; those that do not
# fit in registers are spilled to the frame.
241 movups (%rbx,%rdi,4), %xmm8
242 movups 16(%rbx,%rdi,4), %xmm0
243 movaps %xmm0, -144(%rbp) # 16-byte Spill
244 movups 32(%rbx,%rdi,4), %xmm6
245 movups 48(%rbx,%rdi,4), %xmm1
246 movups (%rbx,%rdx,4), %xmm15
247 movups 16(%rbx,%rdx,4), %xmm0
248 movaps %xmm0, -64(%rbp) # 16-byte Spill
249 movups 32(%rbx,%rdx,4), %xmm0
250 movaps %xmm0, -96(%rbp) # 16-byte Spill
251 movups 48(%rbx,%rdx,4), %xmm0
252 movaps %xmm0, -112(%rbp) # 16-byte Spill
253 movups (%rbx,%rsi,4), %xmm11
254 movups 16(%rbx,%rsi,4), %xmm0
255 movaps %xmm0, -160(%rbp) # 16-byte Spill
256 movups 32(%rbx,%rsi,4), %xmm12
257 movups 48(%rbx,%rsi,4), %xmm0
258 movaps %xmm0, -128(%rbp) # 16-byte Spill
259 movups (%rbx,%rcx,4), %xmm9
260 movups 16(%rbx,%rcx,4), %xmm13
261 movups 32(%rbx,%rcx,4), %xmm2
262 movups 48(%rbx,%rcx,4), %xmm3
# From here on %rbx walks rows of B (starting at %r9); %r13 counts the
# 64 inner iterations.
263 movq %r9, %rbx
264 movl $0, %r13d
265 .p2align 4, 0x90
266 .LBB2_5: # %vector.ph
267 # Parent Loop BB2_1 Depth=1
268 # Parent Loop BB2_2 Depth=2
269 # Parent Loop BB2_3 Depth=3
270 # Parent Loop BB2_4 Depth=4
271 # => This Inner Loop Header: Depth=5
# Depth 5: for each of four 16-float groups, the accumulator vectors and
# the matching B vectors are lane-permuted with unpck*/shufps (see the
# lane annotations on each line), multiplied by the broadcast A scalar,
# added, and then permuted again into the accumulator registers/slots.
# Group 1: accumulators %xmm8, -144(%rbp), %xmm6, %xmm1; B at -192..-144(%rbx).
272 movaps %xmm12, -240(%rbp) # 16-byte Spill
273 movaps %xmm2, -256(%rbp) # 16-byte Spill
274 movaps %xmm3, -272(%rbp) # 16-byte Spill
275 movaps %xmm8, %xmm10
276 movaps -144(%rbp), %xmm7 # 16-byte Reload
277 unpcklps %xmm7, %xmm10 # xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
278 movaps %xmm1, %xmm4
279 shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0]
280 shufps $36, %xmm4, %xmm10 # xmm10 = xmm10[0,1],xmm4[2,0]
281 movaps %xmm7, %xmm5
282 shufps $17, %xmm8, %xmm5 # xmm5 = xmm5[1,0],xmm8[1,0]
283 movaps %xmm6, %xmm4
284 unpcklps %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
285 shufps $226, %xmm4, %xmm5 # xmm5 = xmm5[2,0],xmm4[2,3]
286 movaps %xmm8, %xmm12
287 unpckhps %xmm7, %xmm12 # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
288 movaps %xmm1, %xmm4
289 shufps $34, %xmm6, %xmm4 # xmm4 = xmm4[2,0],xmm6[2,0]
290 shufps $36, %xmm4, %xmm12 # xmm12 = xmm12[0,1],xmm4[2,0]
291 shufps $51, %xmm8, %xmm7 # xmm7 = xmm7[3,0],xmm8[3,0]
292 unpckhps %xmm1, %xmm6 # xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
293 shufps $226, %xmm6, %xmm7 # xmm7 = xmm7[2,0],xmm6[2,3]
294 movaps -160(%rbx), %xmm0
295 movaps -144(%rbx), %xmm1
296 movaps %xmm1, %xmm6
297 shufps $0, %xmm0, %xmm6 # xmm6 = xmm6[0,0],xmm0[0,0]
298 movaps -192(%rbx), %xmm3
299 movaps -176(%rbx), %xmm4
300 movaps %xmm3, %xmm8
301 unpcklps %xmm4, %xmm8 # xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
302 shufps $36, %xmm6, %xmm8 # xmm8 = xmm8[0,1],xmm6[2,0]
303 movaps %xmm0, %xmm2
304 unpcklps %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
305 movaps %xmm4, %xmm6
306 shufps $17, %xmm3, %xmm6 # xmm6 = xmm6[1,0],xmm3[1,0]
307 shufps $226, %xmm2, %xmm6 # xmm6 = xmm6[2,0],xmm2[2,3]
308 movaps %xmm1, %xmm2
309 shufps $34, %xmm0, %xmm2 # xmm2 = xmm2[2,0],xmm0[2,0]
310 movaps %xmm3, %xmm14
311 unpckhps %xmm4, %xmm14 # xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
312 shufps $36, %xmm2, %xmm14 # xmm14 = xmm14[0,1],xmm2[2,0]
313 unpckhps %xmm1, %xmm0 # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
314 shufps $51, %xmm3, %xmm4 # xmm4 = xmm4[3,0],xmm3[3,0]
315 shufps $226, %xmm0, %xmm4 # xmm4 = xmm4[2,0],xmm0[2,3]
# Load one float of A at (%r12,%r13,4) and broadcast it to all four lanes;
# the copy in %xmm5 is spilled to -224(%rbp) for the later groups.
316 movss (%r12,%r13,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
317 shufps $0, %xmm0, %xmm0 # xmm0 = xmm0[0,0,0,0]
318 mulps %xmm0, %xmm8
319 addps %xmm10, %xmm8
320 mulps %xmm0, %xmm6
321 addps %xmm5, %xmm6
322 mulps %xmm0, %xmm14
323 addps %xmm12, %xmm14
324 mulps %xmm0, %xmm4
325 movaps %xmm0, %xmm5
326 addps %xmm7, %xmm4
327 movaps %xmm14, %xmm0
328 unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
329 movaps %xmm6, %xmm1
330 shufps $51, %xmm8, %xmm1 # xmm1 = xmm1[3,0],xmm8[3,0]
331 shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
332 movaps %xmm1, -304(%rbp) # 16-byte Spill
333 movaps %xmm4, %xmm0
334 shufps $34, %xmm14, %xmm0 # xmm0 = xmm0[2,0],xmm14[2,0]
335 movaps %xmm8, %xmm1
336 unpckhps %xmm6, %xmm1 # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
337 shufps $36, %xmm0, %xmm1 # xmm1 = xmm1[0,1],xmm0[2,0]
338 movaps %xmm1, -288(%rbp) # 16-byte Spill
339 movaps %xmm14, %xmm0
340 unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
341 movaps %xmm6, %xmm1
342 shufps $17, %xmm8, %xmm1 # xmm1 = xmm1[1,0],xmm8[1,0]
343 shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
344 movaps %xmm1, -144(%rbp) # 16-byte Spill
345 shufps $0, %xmm14, %xmm4 # xmm4 = xmm4[0,0],xmm14[0,0]
346 unpcklps %xmm6, %xmm8 # xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
347 shufps $36, %xmm4, %xmm8 # xmm8 = xmm8[0,1],xmm4[2,0]
# Group 2: accumulators %xmm15, -64, -96, -112(%rbp); B at -128..-80(%rbx).
348 movaps %xmm15, %xmm14
349 movaps -64(%rbp), %xmm4 # 16-byte Reload
350 unpcklps %xmm4, %xmm14 # xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
351 movaps -112(%rbp), %xmm1 # 16-byte Reload
352 movaps %xmm1, %xmm0
353 movaps -96(%rbp), %xmm3 # 16-byte Reload
354 shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0]
355 shufps $36, %xmm0, %xmm14 # xmm14 = xmm14[0,1],xmm0[2,0]
356 movaps %xmm4, %xmm12
357 shufps $17, %xmm15, %xmm12 # xmm12 = xmm12[1,0],xmm15[1,0]
358 movaps %xmm3, %xmm2
359 unpcklps %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
360 shufps $226, %xmm2, %xmm12 # xmm12 = xmm12[2,0],xmm2[2,3]
361 movaps %xmm15, %xmm7
362 unpckhps %xmm4, %xmm7 # xmm7 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
363 movaps %xmm1, %xmm2
364 shufps $34, %xmm3, %xmm2 # xmm2 = xmm2[2,0],xmm3[2,0]
365 shufps $36, %xmm2, %xmm7 # xmm7 = xmm7[0,1],xmm2[2,0]
366 shufps $51, %xmm15, %xmm4 # xmm4 = xmm4[3,0],xmm15[3,0]
367 unpckhps %xmm1, %xmm3 # xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
368 shufps $226, %xmm3, %xmm4 # xmm4 = xmm4[2,0],xmm3[2,3]
369 movaps %xmm4, -64(%rbp) # 16-byte Spill
370 movaps -96(%rbx), %xmm2
371 movaps -80(%rbx), %xmm1
372 movaps %xmm1, %xmm4
373 shufps $0, %xmm2, %xmm4 # xmm4 = xmm4[0,0],xmm2[0,0]
374 movaps -112(%rbx), %xmm10
375 movaps -128(%rbx), %xmm0
376 movaps %xmm0, %xmm15
377 unpcklps %xmm10, %xmm15 # xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
378 shufps $36, %xmm4, %xmm15 # xmm15 = xmm15[0,1],xmm4[2,0]
379 movaps %xmm2, %xmm4
380 unpcklps %xmm1, %xmm4 # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
381 movaps %xmm10, %xmm6
382 shufps $17, %xmm0, %xmm6 # xmm6 = xmm6[1,0],xmm0[1,0]
383 shufps $226, %xmm4, %xmm6 # xmm6 = xmm6[2,0],xmm4[2,3]
384 movaps %xmm1, %xmm3
385 shufps $34, %xmm2, %xmm3 # xmm3 = xmm3[2,0],xmm2[2,0]
386 movaps %xmm0, %xmm4
387 unpckhps %xmm10, %xmm4 # xmm4 = xmm4[2],xmm10[2],xmm4[3],xmm10[3]
388 shufps $36, %xmm3, %xmm4 # xmm4 = xmm4[0,1],xmm3[2,0]
389 unpckhps %xmm1, %xmm2 # xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
390 shufps $51, %xmm0, %xmm10 # xmm10 = xmm10[3,0],xmm0[3,0]
391 shufps $226, %xmm2, %xmm10 # xmm10 = xmm10[2,0],xmm2[2,3]
392 movaps %xmm5, -224(%rbp) # 16-byte Spill
393 mulps %xmm5, %xmm15
394 addps %xmm14, %xmm15
395 mulps %xmm5, %xmm6
396 addps %xmm12, %xmm6
397 mulps %xmm5, %xmm4
398 addps %xmm7, %xmm4
399 mulps %xmm5, %xmm10
400 addps -64(%rbp), %xmm10 # 16-byte Folded Reload
401 movaps %xmm4, %xmm0
402 unpckhps %xmm10, %xmm0 # xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
403 movaps %xmm6, %xmm1
404 shufps $51, %xmm15, %xmm1 # xmm1 = xmm1[3,0],xmm15[3,0]
405 shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
406 movaps %xmm1, -112(%rbp) # 16-byte Spill
407 movaps %xmm10, %xmm0
408 shufps $34, %xmm4, %xmm0 # xmm0 = xmm0[2,0],xmm4[2,0]
409 movaps %xmm15, %xmm1
410 unpckhps %xmm6, %xmm1 # xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
411 shufps $36, %xmm0, %xmm1 # xmm1 = xmm1[0,1],xmm0[2,0]
412 movaps %xmm1, -96(%rbp) # 16-byte Spill
413 movaps %xmm4, %xmm0
414 unpcklps %xmm10, %xmm0 # xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
415 movaps %xmm6, %xmm1
416 shufps $17, %xmm15, %xmm1 # xmm1 = xmm1[1,0],xmm15[1,0]
417 shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
418 movaps %xmm1, -64(%rbp) # 16-byte Spill
419 shufps $0, %xmm4, %xmm10 # xmm10 = xmm10[0,0],xmm4[0,0]
420 unpcklps %xmm6, %xmm15 # xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1]
421 shufps $36, %xmm10, %xmm15 # xmm15 = xmm15[0,1],xmm10[2,0]
# Group 3: accumulators %xmm11, -160, -240 (spilled %xmm12), -128(%rbp);
# B at -64..-16(%rbx).
422 movaps %xmm11, %xmm10
423 movaps -160(%rbp), %xmm14 # 16-byte Reload
424 unpcklps %xmm14, %xmm10 # xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
425 movaps -128(%rbp), %xmm2 # 16-byte Reload
426 movaps %xmm2, %xmm0
427 movaps -240(%rbp), %xmm3 # 16-byte Reload
428 shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0]
429 shufps $36, %xmm0, %xmm10 # xmm10 = xmm10[0,1],xmm0[2,0]
430 movaps %xmm14, %xmm12
431 shufps $17, %xmm11, %xmm12 # xmm12 = xmm12[1,0],xmm11[1,0]
432 movaps %xmm3, %xmm0
433 unpcklps %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
434 shufps $226, %xmm0, %xmm12 # xmm12 = xmm12[2,0],xmm0[2,3]
435 movaps %xmm11, %xmm0
436 unpckhps %xmm14, %xmm0 # xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
437 movaps %xmm2, %xmm1
438 shufps $34, %xmm3, %xmm1 # xmm1 = xmm1[2,0],xmm3[2,0]
439 shufps $36, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[2,0]
440 shufps $51, %xmm11, %xmm14 # xmm14 = xmm14[3,0],xmm11[3,0]
441 unpckhps %xmm2, %xmm3 # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
442 shufps $226, %xmm3, %xmm14 # xmm14 = xmm14[2,0],xmm3[2,3]
443 movaps -32(%rbx), %xmm1
444 movaps -16(%rbx), %xmm2
445 movaps %xmm2, %xmm3
446 shufps $0, %xmm1, %xmm3 # xmm3 = xmm3[0,0],xmm1[0,0]
447 movaps -48(%rbx), %xmm4
448 movaps -64(%rbx), %xmm5
449 movaps %xmm5, %xmm11
450 unpcklps %xmm4, %xmm11 # xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
451 shufps $36, %xmm3, %xmm11 # xmm11 = xmm11[0,1],xmm3[2,0]
452 movaps %xmm1, %xmm3
453 unpcklps %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
454 movaps %xmm4, %xmm7
455 shufps $17, %xmm5, %xmm7 # xmm7 = xmm7[1,0],xmm5[1,0]
456 shufps $226, %xmm3, %xmm7 # xmm7 = xmm7[2,0],xmm3[2,3]
457 movaps %xmm2, %xmm3
458 shufps $34, %xmm1, %xmm3 # xmm3 = xmm3[2,0],xmm1[2,0]
459 movaps %xmm5, %xmm6
460 unpckhps %xmm4, %xmm6 # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
461 shufps $36, %xmm3, %xmm6 # xmm6 = xmm6[0,1],xmm3[2,0]
462 unpckhps %xmm2, %xmm1 # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
463 shufps $51, %xmm5, %xmm4 # xmm4 = xmm4[3,0],xmm5[3,0]
464 shufps $226, %xmm1, %xmm4 # xmm4 = xmm4[2,0],xmm1[2,3]
465 movaps -224(%rbp), %xmm1 # 16-byte Reload
466 mulps %xmm1, %xmm11
467 addps %xmm10, %xmm11
468 mulps %xmm1, %xmm7
469 addps %xmm12, %xmm7
470 mulps %xmm1, %xmm6
471 addps %xmm0, %xmm6
472 mulps %xmm1, %xmm4
473 addps %xmm14, %xmm4
474 movaps %xmm6, %xmm0
475 unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
476 movaps %xmm7, %xmm1
477 shufps $51, %xmm11, %xmm1 # xmm1 = xmm1[3,0],xmm11[3,0]
478 shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
479 movaps %xmm1, -128(%rbp) # 16-byte Spill
480 movaps %xmm4, %xmm0
481 shufps $34, %xmm6, %xmm0 # xmm0 = xmm0[2,0],xmm6[2,0]
482 movaps %xmm11, %xmm12
483 unpckhps %xmm7, %xmm12 # xmm12 = xmm12[2],xmm7[2],xmm12[3],xmm7[3]
484 shufps $36, %xmm0, %xmm12 # xmm12 = xmm12[0,1],xmm0[2,0]
485 movaps %xmm6, %xmm0
486 unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
487 movaps %xmm7, %xmm1
488 shufps $17, %xmm11, %xmm1 # xmm1 = xmm1[1,0],xmm11[1,0]
489 shufps $226, %xmm0, %xmm1 # xmm1 = xmm1[2,0],xmm0[2,3]
490 movaps %xmm1, -160(%rbp) # 16-byte Spill
491 shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0]
492 unpcklps %xmm7, %xmm11 # xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
493 shufps $36, %xmm4, %xmm11 # xmm11 = xmm11[0,1],xmm4[2,0]
# Group 4: accumulators %xmm9, %xmm13, -256 (spilled %xmm2), -272 (spilled
# %xmm3); B at 0..48(%rbx).
494 movaps %xmm9, %xmm10
495 unpcklps %xmm13, %xmm10 # xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
496 movaps -272(%rbp), %xmm2 # 16-byte Reload
497 movaps %xmm2, %xmm0
498 movaps -256(%rbp), %xmm3 # 16-byte Reload
499 shufps $0, %xmm3, %xmm0 # xmm0 = xmm0[0,0],xmm3[0,0]
500 shufps $36, %xmm0, %xmm10 # xmm10 = xmm10[0,1],xmm0[2,0]
501 movaps %xmm13, %xmm14
502 shufps $17, %xmm9, %xmm14 # xmm14 = xmm14[1,0],xmm9[1,0]
503 movaps %xmm3, %xmm0
504 unpcklps %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
505 shufps $226, %xmm0, %xmm14 # xmm14 = xmm14[2,0],xmm0[2,3]
506 movaps %xmm9, %xmm0
507 unpckhps %xmm13, %xmm0 # xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
508 movaps %xmm2, %xmm1
509 shufps $34, %xmm3, %xmm1 # xmm1 = xmm1[2,0],xmm3[2,0]
510 shufps $36, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[2,0]
511 shufps $51, %xmm9, %xmm13 # xmm13 = xmm13[3,0],xmm9[3,0]
512 unpckhps %xmm2, %xmm3 # xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
513 shufps $226, %xmm3, %xmm13 # xmm13 = xmm13[2,0],xmm3[2,3]
514 movaps 32(%rbx), %xmm1
515 movaps 48(%rbx), %xmm2
516 movaps %xmm2, %xmm3
517 shufps $0, %xmm1, %xmm3 # xmm3 = xmm3[0,0],xmm1[0,0]
518 movaps 16(%rbx), %xmm4
519 movaps (%rbx), %xmm5
520 movaps %xmm5, %xmm9
521 unpcklps %xmm4, %xmm9 # xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
522 shufps $36, %xmm3, %xmm9 # xmm9 = xmm9[0,1],xmm3[2,0]
523 movaps %xmm1, %xmm3
524 unpcklps %xmm2, %xmm3 # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
525 movaps %xmm4, %xmm7
526 shufps $17, %xmm5, %xmm7 # xmm7 = xmm7[1,0],xmm5[1,0]
527 shufps $226, %xmm3, %xmm7 # xmm7 = xmm7[2,0],xmm3[2,3]
528 movaps %xmm2, %xmm3
529 shufps $34, %xmm1, %xmm3 # xmm3 = xmm3[2,0],xmm1[2,0]
530 movaps %xmm5, %xmm6
531 unpckhps %xmm4, %xmm6 # xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
532 shufps $36, %xmm3, %xmm6 # xmm6 = xmm6[0,1],xmm3[2,0]
533 unpckhps %xmm2, %xmm1 # xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
534 shufps $51, %xmm5, %xmm4 # xmm4 = xmm4[3,0],xmm5[3,0]
535 shufps $226, %xmm1, %xmm4 # xmm4 = xmm4[2,0],xmm1[2,3]
536 movaps -224(%rbp), %xmm1 # 16-byte Reload
537 mulps %xmm1, %xmm9
538 addps %xmm10, %xmm9
539 mulps %xmm1, %xmm7
540 addps %xmm14, %xmm7
541 mulps %xmm1, %xmm6
542 addps %xmm0, %xmm6
543 mulps %xmm1, %xmm4
544 addps %xmm13, %xmm4
545 movaps %xmm6, %xmm0
546 unpckhps %xmm4, %xmm0 # xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
547 movaps %xmm7, %xmm3
548 shufps $51, %xmm9, %xmm3 # xmm3 = xmm3[3,0],xmm9[3,0]
549 shufps $226, %xmm0, %xmm3 # xmm3 = xmm3[2,0],xmm0[2,3]
550 movaps %xmm4, %xmm0
551 shufps $34, %xmm6, %xmm0 # xmm0 = xmm0[2,0],xmm6[2,0]
552 movaps %xmm9, %xmm2
553 unpckhps %xmm7, %xmm2 # xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
554 shufps $36, %xmm0, %xmm2 # xmm2 = xmm2[0,1],xmm0[2,0]
555 movaps %xmm6, %xmm0
556 unpcklps %xmm4, %xmm0 # xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
557 movaps %xmm7, %xmm13
558 shufps $17, %xmm9, %xmm13 # xmm13 = xmm13[1,0],xmm9[1,0]
559 shufps $226, %xmm0, %xmm13 # xmm13 = xmm13[2,0],xmm0[2,3]
560 shufps $0, %xmm6, %xmm4 # xmm4 = xmm4[0,0],xmm6[0,0]
# Bring the group-1 results spilled at -288/-304(%rbp) back into the
# registers the loop top (and the stores after the loop) read them from.
561 movaps -288(%rbp), %xmm6 # 16-byte Reload
562 movaps -304(%rbp), %xmm1 # 16-byte Reload
563 unpcklps %xmm7, %xmm9 # xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
564 shufps $36, %xmm4, %xmm9 # xmm9 = xmm9[0,1],xmm4[2,0]
# Next inner step: next float of A in this row, next row of B (6144 bytes).
565 addq $1, %r13
566 addq $6144, %rbx # imm = 0x1800
567 cmpq $64, %r13
568 jne .LBB2_5
569 # %bb.6: # %polly.loop_exit34
570 # in Loop: Header=BB2_4 Depth=4
# Write the 16 accumulator vectors back to the four 16-float groups of
# the C row (addresses saved in %r8, %r15, %r10, %r11).
571 movups %xmm8, (%r8)
572 movaps -144(%rbp), %xmm0 # 16-byte Reload
573 movups %xmm0, 16(%r8)
574 movups %xmm6, 32(%r8)
575 movups %xmm1, 48(%r8)
576 movaps -112(%rbp), %xmm0 # 16-byte Reload
577 movups %xmm0, 48(%r15)
578 movaps -96(%rbp), %xmm0 # 16-byte Reload
579 movups %xmm0, 32(%r15)
580 movaps -64(%rbp), %xmm0 # 16-byte Reload
581 movups %xmm0, 16(%r15)
582 movups %xmm15, (%r15)
583 movaps -128(%rbp), %xmm0 # 16-byte Reload
584 movups %xmm0, 48(%r10)
585 movaps -160(%rbp), %xmm0 # 16-byte Reload
586 movups %xmm0, 16(%r10)
587 movups %xmm11, (%r10)
588 movups %xmm12, 32(%r10)
589 movups %xmm3, 48(%r11)
590 movups %xmm13, 16(%r11)
591 movups %xmm9, (%r11)
592 movups %xmm2, 32(%r11)
# Next row: %r12 moves down one row of A; stop when %r14 reaches the
# bound stored at -80(%rbp).
593 addq $1, %r14
594 addq $6144, %r12 # imm = 0x1800
595 cmpq -80(%rbp), %r14 # 8-byte Folded Reload
596 jne .LBB2_4
597 # %bb.7: # %polly.loop_exit28
598 # in Loop: Header=BB2_3 Depth=3
# Depth-3 step: counter += 64, %r9 skips 64 rows of B (64 * 6144 bytes),
# %r12 (restored to its value at loop entry) skips 64 floats of A.
599 movq -192(%rbp), %rax # 8-byte Reload
600 addq $64, %rax
601 addq $393216, %r9 # imm = 0x60000
602 movq -200(%rbp), %r12 # 8-byte Reload
603 addq $256, %r12 # imm = 0x100
604 cmpq $1536, %rax # imm = 0x600
605 jb .LBB2_3
606 # %bb.8: # %polly.loop_exit22
607 # in Loop: Header=BB2_2 Depth=2
# Depth-2 step: counter += 64, tile index += 1, %r9 (restored) skips
# 64 floats within the row of B.
608 movq -168(%rbp), %rax # 8-byte Reload
609 addq $64, %rax
610 movq -176(%rbp), %rdi # 8-byte Reload
611 addq $1, %rdi
612 movq -184(%rbp), %r9 # 8-byte Reload
613 addq $256, %r9 # imm = 0x100
614 cmpq $1536, %rax # imm = 0x600
615 jb .LBB2_2
616 # %bb.9: # %polly.loop_exit16
617 # in Loop: Header=BB2_1 Depth=1
# Depth-1 step: first row and row bound both += 64, A pointer skips
# 64 rows (64 * 6144 bytes).
618 movq -48(%rbp), %rax # 8-byte Reload
619 movq %rax, %rcx
620 addq $64, %rcx
621 addq $64, -80(%rbp) # 8-byte Folded Spill
622 addq $393216, -72(%rbp) # 8-byte Folded Spill
623 # imm = 0x60000
624 movq %rcx, %rax
625 movq %rcx, -48(%rbp) # 8-byte Spill
626 cmpq $1536, %rcx # imm = 0x600
627 jb .LBB2_1
628 # %bb.10: # %polly.exiting
# Return 0 and unwind the frame.
629 xorl %eax, %eax
630 addq $264, %rsp # imm = 0x108
631 popq %rbx
632 popq %r12
633 popq %r13
634 popq %r14
635 popq %r15
636 popq %rbp
637 .cfi_def_cfa %rsp, 8
638 retq
639 .Lfunc_end2:
640 .size main, .Lfunc_end2-main
641 .cfi_endproc
642 # -- End function
# Global data.  A, B and C are common symbols of 9437184 bytes each
# (1536 * 1536 * 4), aligned to 16 bytes.
643 .type A,@object # @A
644 .comm A,9437184,16
645 .type B,@object # @B
646 .comm B,9437184,16
# NUL-terminated format string "%lf " (5 bytes) in the mergeable string section.
647 .type .L.str,@object # @.str
648 .section .rodata.str1.1,"aMS",@progbits,1
649 .L.str:
650 .asciz "%lf "
651 .size .L.str, 5
653 .type C,@object # @C
654 .comm C,9437184,16
# Compiler identification, and an empty .note.GNU-stack section.
656 .ident "clang version 8.0.0 (trunk 342834) (llvm/trunk 342856)"
657 .section ".note.GNU-stack","",@progbits