Revert r19317, "simplify slice_end, return size of output frame".
[ffmpeg-lucabe.git] / libavcodec / x86 / fft_mmx.asm
blob1a9d1894cdfeae6253ff79037790b0ed65958698
1 ;******************************************************************************
2 ;* FFT transform with SSE/3DNow optimizations
3 ;* Copyright (c) 2008 Loren Merritt
4 ;*
5 ;* This file is part of FFmpeg.
6 ;*
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 ; These functions are not individually interchangeable with the C versions.
23 ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
24 ;* in blocks as convenient to the vector size.
25 ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
27 %include "x86inc.asm"
29 SECTION_RODATA
; 1/sqrt(2), replicated across a full vector for the radix-8 twiddle scaling
31 %define M_SQRT1_2 0.70710678118654752440
32 ps_root2: times 4 dd M_SQRT1_2
; mixed-sign +/- sqrt(1/2) multipliers consumed by T8_SSE
33 ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; sign-bit mask: pxor with this flips the sign of the low float of a pair
34 ps_m1p1: dd 1<<31, 0
; declare the external twiddle tables ff_cos_16 .. ff_cos_65536 (13 tables)
36 %assign i 16
37 %rep 13
38 cextern ff_cos_ %+ i
39 %assign i i<<1
40 %endrep
; pointer-sized data directive, used for the dispatch table entries below
42 %ifdef ARCH_X86_64
43 %define pointer dq
44 %else
45 %define pointer dd
46 %endif
; Conditional-assembly helpers: IF0 swallows its arguments, IF1 emits them
; verbatim.  PASS_SMALL/PASS_BIG expand "IF%1 <insn>" to include or drop
; loads/stores depending on the pass variant.
; NOTE: the scraped copy had lost the "%1" body line (original line 51),
; which silently turned IF1 into a no-op — restored here.
48 %macro IF0 1+
49 %endmacro
50 %macro IF1 1+
51     %1
52 %endmacro
54 section .text align=16
; radix-2 butterfly (3DNow): %1 = %3 + %4, %2 = %3 - %4 (packed single pairs)
56 %macro T2_3DN 4 ; z0, z1, mem0, mem1
57 mova %1, %3
58 mova %2, %1
59 pfadd %1, %4
60 pfsub %2, %4
61 %endmacro
; radix-4 butterfly (3DNow): combines four {re,im} pairs in %1-%4 into the
; four outputs {r0,i0}..{r3,i3}; %5/%6 are scratch.  Relies on ps_m1p1 to
; negate one component and pswapd to exchange re/im.
63 %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
64 mova %5, %3
65 pfsub %3, %4
66 pfadd %5, %4 ; {t6,t5}
67 pxor %3, [ps_m1p1 GLOBAL] ; {t8,t7}
68 mova %6, %1
69 pswapd %3, %3
70 pfadd %1, %5 ; {r0,i0}
71 pfsub %6, %5 ; {r2,i2}
72 mova %4, %2
73 pfadd %2, %3 ; {r1,i1}
74 pfsub %4, %3 ; {r3,i3}
75 SWAP %3, %6
76 %endmacro
78 ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3}
79 ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
80 %macro T4_SSE 3
; 4-point FFT on the interleaved pairs in %1/%2; %3 is scratch.
; Output is de-interleaved: %1 = all reals, %2 = all imaginaries.
81 mova %3, %1
82 shufps %1, %2, 0x64 ; {r0,i0,r3,i2}
83 shufps %3, %2, 0xce ; {r1,i1,r2,i3}
84 mova %2, %1
85 addps %1, %3 ; {t1,t2,t6,t5}
86 subps %2, %3 ; {t3,t4,t8,t7}
87 mova %3, %1
88 shufps %1, %2, 0x44 ; {t1,t2,t3,t4}
89 shufps %3, %2, 0xbe ; {t6,t5,t7,t8}
90 mova %2, %1
91 addps %1, %3 ; {r0,i0,r1,i1}
92 subps %2, %3 ; {r2,i2,r3,i3}
93 mova %3, %1
94 shufps %1, %2, 0x88 ; {r0,r1,r2,r3}
95 shufps %3, %2, 0xdd ; {i0,i1,i2,i3}
96 SWAP %2, %3
97 %endmacro
; second half of an 8-point FFT: combines the 4-point result in %1/%2 with
; the odd-index inputs in %3/%4, applying the +/- sqrt(1/2) twiddles from
; ps_root2 / ps_root2mppm; %5/%6 are scratch
99 %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1
100 mova %5, %3
101 shufps %3, %4, 0x44 ; {r4,i4,r6,i6}
102 shufps %5, %4, 0xee ; {r5,i5,r7,i7}
103 mova %6, %3
104 subps %3, %5 ; {r5,i5,r7,i7}
105 addps %6, %5 ; {t1,t2,t3,t4}
106 mova %5, %3
107 shufps %5, %5, 0xb1 ; {i5,r5,i7,r7}
108 mulps %3, [ps_root2mppm GLOBAL] ; {-r5,i5,r7,-i7}
109 mulps %5, [ps_root2 GLOBAL]
110 addps %3, %5 ; {t8,t7,ta,t9}
111 mova %5, %6
112 shufps %6, %3, 0x36 ; {t3,t2,t9,t8}
113 shufps %5, %3, 0x9c ; {t1,t4,t7,ta}
114 mova %3, %6
115 addps %6, %5 ; {t1,t2,t9,ta}
116 subps %3, %5 ; {t6,t5,tc,tb}
117 mova %5, %6
118 shufps %6, %3, 0xd8 ; {t1,t9,t5,tb}
119 shufps %5, %3, 0x8d ; {t2,ta,t6,tc}
120 mova %3, %1
121 mova %4, %2
122 addps %1, %6 ; {r0,r1,r2,r3}
123 addps %2, %5 ; {i0,i1,i2,i3}
124 subps %3, %6 ; {r4,r5,r6,r7}
125 subps %4, %5 ; {i4,i5,i6,i7}
126 %endmacro
128 ; scheduled for cpu-bound sizes
129 %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
; One radix-4 combine pass over the eight vectors Z(0..7), with twiddle
; vectors %2 (wre) / %3 (wim) baked in as immediates-in-memory.
; %1 selects (via IF%1) whether m4-m7 still need to be loaded from Z(4..7)
; or were left there by the caller.  Instruction order is hand-scheduled
; for cpu-bound sizes — do not reorder.
130 IF%1 mova m4, Z(4)
131 IF%1 mova m5, Z(5)
132 mova m0, %2 ; wre
133 mova m2, m4
134 mova m1, %3 ; wim
135 mova m3, m5
136 mulps m2, m0 ; r2*wre
137 IF%1 mova m6, Z(6)
138 mulps m3, m1 ; i2*wim
139 IF%1 mova m7, Z(7)
140 mulps m4, m1 ; r2*wim
141 mulps m5, m0 ; i2*wre
142 addps m2, m3 ; r2*wre + i2*wim
143 mova m3, m1
144 mulps m1, m6 ; r3*wim
145 subps m5, m4 ; i2*wre - r2*wim
146 mova m4, m0
147 mulps m3, m7 ; i3*wim
148 mulps m4, m6 ; r3*wre
149 mulps m0, m7 ; i3*wre
150 subps m4, m3 ; r3*wre - i3*wim
151 mova m3, Z(0)
152 addps m0, m1 ; i3*wre + r3*wim
153 mova m1, m4
154 addps m4, m2 ; t5
155 subps m1, m2 ; t3
156 subps m3, m4 ; r2
157 addps m4, Z(0) ; r0
158 mova m6, Z(2)
159 mova Z(4), m3
160 mova Z(0), m4
161 mova m3, m5
162 subps m5, m0 ; t4
163 mova m4, m6
164 subps m6, m5 ; r3
165 addps m5, m4 ; r1
166 mova Z(6), m6
167 mova Z(2), m5
168 mova m2, Z(3)
169 addps m3, m0 ; t6
170 subps m2, m1 ; i3
171 mova m7, Z(1)
172 addps m1, Z(3) ; i1
173 mova Z(7), m2
174 mova Z(3), m1
175 mova m4, m7
176 subps m7, m3 ; i2
177 addps m3, m4 ; i0
178 mova Z(5), m7
179 mova Z(1), m3
180 %endmacro
182 ; scheduled to avoid store->load aliasing
183 %macro PASS_BIG 1 ; (!interleave)
; Same butterfly as PASS_SMALL, but the twiddles are streamed from memory
; ([wq] / [wq+o1q]) and the schedule avoids store->load aliasing.
; When %1==0 the Z() stores are skipped and the results are instead
; re-interleaved (unpck*) back into {re,im} pair order for the final pass.
184 mova m4, Z(4) ; r2
185 mova m5, Z(5) ; i2
186 mova m2, m4
187 mova m0, [wq] ; wre
188 mova m3, m5
189 mova m1, [wq+o1q] ; wim
190 mulps m2, m0 ; r2*wre
191 mova m6, Z(6) ; r3
192 mulps m3, m1 ; i2*wim
193 mova m7, Z(7) ; i3
194 mulps m4, m1 ; r2*wim
195 mulps m5, m0 ; i2*wre
196 addps m2, m3 ; r2*wre + i2*wim
197 mova m3, m1
198 mulps m1, m6 ; r3*wim
199 subps m5, m4 ; i2*wre - r2*wim
200 mova m4, m0
201 mulps m3, m7 ; i3*wim
202 mulps m4, m6 ; r3*wre
203 mulps m0, m7 ; i3*wre
204 subps m4, m3 ; r3*wre - i3*wim
205 mova m3, Z(0)
206 addps m0, m1 ; i3*wre + r3*wim
207 mova m1, m4
208 addps m4, m2 ; t5
209 subps m1, m2 ; t3
210 subps m3, m4 ; r2
211 addps m4, Z(0) ; r0
212 mova m6, Z(2)
213 mova Z(4), m3
214 mova Z(0), m4
215 mova m3, m5
216 subps m5, m0 ; t4
217 mova m4, m6
218 subps m6, m5 ; r3
219 addps m5, m4 ; r1
220 IF%1 mova Z(6), m6
221 IF%1 mova Z(2), m5
222 mova m2, Z(3)
223 addps m3, m0 ; t6
224 subps m2, m1 ; i3
225 mova m7, Z(1)
226 addps m1, Z(3) ; i1
227 IF%1 mova Z(7), m2
228 IF%1 mova Z(3), m1
229 mova m4, m7
230 subps m7, m3 ; i2
231 addps m3, m4 ; i0
232 IF%1 mova Z(5), m7
233 IF%1 mova Z(1), m3
; interleave-and-store path for the final pass (output in natural order)
234 %if %1==0
235 mova m4, m5 ; r1
236 mova m0, m6 ; r3
237 unpcklps m5, m1
238 unpckhps m4, m1
239 unpcklps m6, m2
240 unpckhps m0, m2
241 mova m1, Z(0)
242 mova m2, Z(4)
243 mova Z(2), m5
244 mova Z(3), m4
245 mova Z(6), m6
246 mova Z(7), m0
247 mova m5, m1 ; r0
248 mova m4, m2 ; r2
249 unpcklps m1, m3
250 unpckhps m5, m3
251 unpcklps m2, m7
252 unpckhps m4, m7
253 mova Z(0), m1
254 mova Z(1), m5
255 mova Z(4), m2
256 mova Z(5), m4
257 %endif
258 %endmacro
; dword interleave of %1 and %2: %1 = low halves interleaved,
; %3 = high halves interleaved (%3 is the scratch/second output)
260 %macro PUNPCK 3
261 mova %3, %1
262 punpckldq %1, %2
263 punpckhdq %3, %2
264 %endmacro
266 INIT_XMM
268 %define Z(x) [r0+mmsize*x]
; 4-point FFT, in place at [r0] (Z() indexes mmsize blocks from r0).
; NOTE: the scraped copy had lost the trailing "ret" (original line 277),
; which would make execution fall through into fft8_sse — restored here.
270 align 16
271 fft4_sse:
272 mova m0, Z(0)
273 mova m1, Z(1)
274 T4_SSE m0, m1, m2
275 mova Z(0), m0
276 mova Z(1), m1
277 ret
; 8-point FFT, in place at [r0]: 4-point half via T4_SSE, then T8_SSE
; folds in Z(2)/Z(3) with the sqrt(1/2) twiddles.
; Restored the trailing "ret" (original line 291) dropped by the scrape.
279 align 16
280 fft8_sse:
281 mova m0, Z(0)
282 mova m1, Z(1)
283 T4_SSE m0, m1, m2
284 mova m2, Z(2)
285 mova m3, Z(3)
286 T8_SSE m0, m1, m2, m3, m4, m5
287 mova Z(0), m0
288 mova Z(1), m1
289 mova Z(2), m2
290 mova Z(3), m3
291 ret
; 16-point FFT, in place at [r0]: an 8-point on Z(0..3), two 4-points on
; Z(4..5)/Z(6..7), combined by PASS_SMALL with the ff_cos_16 twiddles.
; Restored the trailing "ret" (original line 312) dropped by the scrape.
293 align 16
294 fft16_sse:
295 mova m0, Z(0)
296 mova m1, Z(1)
297 T4_SSE m0, m1, m2
298 mova m2, Z(2)
299 mova m3, Z(3)
300 T8_SSE m0, m1, m2, m3, m4, m5
301 mova m4, Z(4)
302 mova m5, Z(5)
303 mova Z(0), m0
304 mova Z(1), m1
305 mova Z(2), m2
306 mova Z(3), m3
307 T4_SSE m4, m5, m6
308 mova m6, Z(6)
309 mova m7, Z(7)
310 T4_SSE m6, m7, m0
311 PASS_SMALL 0, [ff_cos_16 GLOBAL], [ff_cos_16+16 GLOBAL]
312 ret
315 INIT_MMX
; Declares the fft4 and fft8 kernels for one 3DNow flavor (%1 = suffix).
; Instantiated once with the native 3DNow2 pswapd, and once more after the
; pswapd emulation macro below is defined, for plain 3DNow.
; Restored the two "ret" lines (original lines 330 and 368) dropped by the
; scrape — without them fft4%1 falls through into fft8%1.
317 %macro FFT48_3DN 1
318 align 16
319 fft4%1:
320 T2_3DN m0, m1, Z(0), Z(1)
321 mova m2, Z(2)
322 mova m3, Z(3)
323 T4_3DN m0, m1, m2, m3, m4, m5
324 PUNPCK m0, m1, m4
325 PUNPCK m2, m3, m5
326 mova Z(0), m0
327 mova Z(1), m4
328 mova Z(2), m2
329 mova Z(3), m5
330 ret

332 align 16
333 fft8%1:
334 T2_3DN m0, m1, Z(0), Z(1)
335 mova m2, Z(2)
336 mova m3, Z(3)
337 T4_3DN m0, m1, m2, m3, m4, m5
338 mova Z(0), m0
339 mova Z(2), m2
340 T2_3DN m4, m5, Z(4), Z(5)
341 T2_3DN m6, m7, Z(6), Z(7)
342 pswapd m0, m5
343 pswapd m2, m7
344 pxor m0, [ps_m1p1 GLOBAL]
345 pxor m2, [ps_m1p1 GLOBAL]
346 pfsub m5, m0
347 pfadd m7, m2
348 pfmul m5, [ps_root2 GLOBAL]
349 pfmul m7, [ps_root2 GLOBAL]
350 T4_3DN m1, m3, m5, m7, m0, m2
351 mova Z(5), m5
352 mova Z(7), m7
353 mova m0, Z(0)
354 mova m2, Z(2)
355 T4_3DN m0, m2, m4, m6, m5, m7
356 PUNPCK m0, m1, m5
357 PUNPCK m2, m3, m7
358 mova Z(0), m0
359 mova Z(1), m5
360 mova Z(2), m2
361 mova Z(3), m7
362 PUNPCK m4, Z(5), m5
363 PUNPCK m6, Z(7), m7
364 mova Z(4), m4
365 mova Z(5), m5
366 mova Z(6), m6
367 mova Z(7), m7
368 ret
369 %endmacro
371 FFT48_3DN _3dn2
; emulate the 3DNow2 pswapd (swap the two dwords of an mmx reg) using only
; base 3DNow/MMX ops, so FFT48_3DN can be re-instantiated for plain 3DNow.
; The in-place case (%1 == %2) spills through memory: it writes [r0+12] and
; reads back the qword at [r0+8].
; NOTE(review): this clobbers data at [r0+8..15] — presumably safe because
; the callers have already consumed that slot at every in-place use; confirm
; before reusing this macro elsewhere.
373 %macro pswapd 2
374 %ifidn %1, %2
375 movd [r0+12], %1
376 punpckhdq %1, [r0+8]
377 %else
378 movq %1, %2
379 psrlq %1, 32
380 punpckldq %1, %2
381 %endif
382 %endmacro
384 FFT48_3DN _3dn
387 %define Z(x) [zq + o1q*(x&6)*((x/6)^1) + o3q*(x/6) + mmsize*(x&1)]
; Declares one pass function: %1 = its name (entry label), %2+ = the
; butterfly macro invocation executed once per loop iteration.
; Args (DEFINE_ARGS): z = data, w = twiddle ptr, n = remaining count,
; o1/o3 = row offsets used by the Z() addressing macro above.
; Restored the entry label "%1:" (original line 391) and the loop payload
; "%2" (original line 397) dropped by the scrape — without them the macro
; emits an unlabelled, empty loop.
389 %macro DECL_PASS 2+ ; name, payload
390 align 16
391 %1:
392 DEFINE_ARGS z, w, n, o1, o3
393 lea o3q, [nq*3]
394 lea o1q, [nq*8]
395 shl o3q, 4
396 .loop:
397 %2
398 add zq, mmsize*2
399 add wq, mmsize
400 sub nd, mmsize/8
401 jg .loop
402 rep ret
403 %endmacro
; SSE passes
405 INIT_XMM
406 DECL_PASS pass_sse, PASS_BIG 1
407 DECL_PASS pass_interleave_sse, PASS_BIG 0
; 3DNow passes: alias the SSE mnemonics used inside PASS_* onto their
; MMX/3DNow equivalents so the same macro bodies assemble for mmx regs
409 INIT_MMX
410 %define mulps pfmul
411 %define addps pfadd
412 %define subps pfsub
413 %define unpcklps punpckldq
414 %define unpckhps punpckhdq
415 DECL_PASS pass_3dn, PASS_SMALL 1, [wq], [wq+o1q]
416 DECL_PASS pass_interleave_3dn, PASS_BIG 0
; the 3dn2 flavor shares the 3dn passes (only the fft4/fft8 kernels differ)
417 %define pass_3dn2 pass_3dn
418 %define pass_interleave_3dn2 pass_interleave_3dn
; Builds the whole fft family for one cpu flavor.
; %1 = log2 of the largest hand-written kernel (4 or 5), %2 = cpu suffix,
; %3 = optional "_interleave".  Each larger size is composed recursively:
; one half-size fft, two quarter-size ffts, then a tail-jump into the
; matching pass function.  All entry points are collected in list_of_fft
; and emitted as dispatch_tab, indexed by nbits-2 in fft_dispatch.
; Restored the "RET" (original line 464) dropped by the scrape — it is the
; cglobal epilogue; without it fft_dispatch falls off the end.
421 %macro DECL_FFT 2-3 ; nbits, cpu, suffix
422 %xdefine list_of_fft fft4%2, fft8%2
423 %if %1==5
424 %xdefine list_of_fft list_of_fft, fft16%2
425 %endif

427 %assign n 1<<%1
428 %rep 17-%1
429 %assign n2 n/2
430 %assign n4 n/4
431 %xdefine list_of_fft list_of_fft, fft %+ n %+ %3%2

433 align 16
434 fft %+ n %+ %3%2:
435 call fft %+ n2 %+ %2
436 add r0, n*4 - (n&(-2<<%1))
437 call fft %+ n4 %+ %2
438 add r0, n*2 - (n2&(-2<<%1))
439 call fft %+ n4 %+ %2
440 sub r0, n*6 + (n2&(-2<<%1))
441 lea r1, [ff_cos_ %+ n GLOBAL]
442 mov r2d, n4/2
443 jmp pass%3%2

445 %assign n n*2
446 %endrep
447 %undef n

449 %ifidn __OUTPUT_FORMAT__,macho64
450 section .rodata
451 %endif

453 align 8
454 dispatch_tab%3%2: pointer list_of_fft

456 section .text

458 ; On x86_32, this function does the register saving and restoring for all of fft.
459 ; The others pass args in registers and don't spill anything.
460 cglobal fft_dispatch%3%2, 2,5,0, z, nbits
461 lea r2, [dispatch_tab%3%2 GLOBAL]
462 mov r2, [r2 + (nbitsq-2)*gprsize]
463 call r2
464 RET
465 %endmacro ; DECL_FFT
; instantiate the families: SSE with kernels up to 16 points (log2 = 5),
; the two 3DNow flavors with kernels up to 8 points (log2 = 4);
; each also gets an _interleave variant for natural-order output
467 DECL_FFT 5, _sse
468 DECL_FFT 5, _sse, _interleave
469 DECL_FFT 4, _3dn
470 DECL_FFT 4, _3dn, _interleave
471 DECL_FFT 4, _3dn2
472 DECL_FFT 4, _3dn2, _interleave