4 * Copyright (C) Yuqing Deng <Yuqing_Deng@brown.edu> - April 2000
6 * 64 and 128 point split radix fft for ac3dec
8 * The algorithm is described in the book:
9 * "Computational Frameworks of the Fast Fourier Transform".
11 * The ideas and the organization of code are borrowed from djbfft written by
12 * D. J. Bernstein <djb@cr.yp.to>. djbfft can be found at
13 * http://cr.yp.to/djbfft.html.
15 * srfftp.h is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
20 * srfftp.h is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * GNU General Public License for more details.
25 * You should have received a copy of the GNU General Public License
26 * along with GNU Make; see the file COPYING. If not, write to
27 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
29 * Modified for using AMD's 3DNow! - 3DNowEx(DSP)! SIMD operations
30 * by Nick Kurshev <nickols_k@mail.ru>
33 #ifndef SRFFTP_3DNOW_H__
34 #define SRFFTP_3DNOW_H__
/*
 * TRANS_FILL_MM6_MM7_3DNOW(): asm statement with no outputs and two memory
 * inputs, x_plus_minus_3dnow (%0) and x_minus_plus_3dnow (%1).
 * NOTE(review): the instruction strings and the end of the statement are not
 * visible in this excerpt. Judging by the macro name, and by the way the other
 * macros in this file XOR mm6/mm7 into a complex value to flip the sign of one
 * lane, this presumably preloads mm6 and mm7 with these two constants --
 * confirm against the full file.
 */
41 #define TRANS_FILL_MM6_MM7_3DNOW()\
42 __asm__ __volatile__(\
45 ::"m"(x_plus_minus_3dnow),/* input %0 */\
46 "m"(x_minus_plus_3dnow)/* input %1 */\
/*
 * PSWAP_MM(mm_base, mm_hlp): expands to asm text that exchanges the low and
 * high 32-bit lanes of the MMX register named by the string mm_base.
 * Two alternative definitions follow. NOTE(review): the preprocessor
 * conditional that selects between them is not visible in this excerpt.
 *
 * Variant 1: a single pswapd instruction; mm_hlp is accepted but unused.
 */
50 #define PSWAP_MM(mm_base,mm_hlp) "pswapd "mm_base","mm_base"\n\t"
/*
 * Variant 2: the same lane swap without pswapd. mm_hlp is used as a scratch
 * register and is overwritten with the original value of mm_base.
 */
52 #define PSWAP_MM(mm_base,mm_hlp)\
53 "movq "mm_base","mm_hlp"\n\t"/* hlp = lo | hi (copy of base) */\
54 "psrlq $32, "mm_base"\n\t"/* base = hi | 0 */\
55 "punpckldq "mm_hlp","mm_base"\n\t"/* base = hi | lo */
/*
 * PFNACC_MM(mm_base, mm_hlp): expands to asm text that leaves (lo - hi) of the
 * MMX register named by mm_base in that register's low 32-bit float lane.
 * Two alternative definitions follow. NOTE(review): the preprocessor
 * conditional that selects between them is not visible in this excerpt.
 *
 * Variant 1: a single pfnacc with the register as both operands, so both lanes
 * end up holding (lo - hi); mm_hlp is accepted but unused.
 */
58 #define PFNACC_MM(mm_base,mm_hlp) "pfnacc "mm_base","mm_base"\n\t"
/*
 * Variant 2: emulation without pfnacc. Only the low lane matches variant 1:
 * the high lane becomes hi - hi (zero for finite inputs) instead of (lo - hi).
 * mm_hlp is used as a scratch register and is overwritten.
 */
60 #define PFNACC_MM(mm_base,mm_hlp)\
61 "movq "mm_base","mm_hlp"\n\t"/* hlp = lo | hi (copy of base) */\
62 "psrlq $32,"mm_hlp"\n\t"/* hlp = hi | 0 */\
63 "punpckldq "mm_hlp","mm_hlp"\n\t"/* hlp = hi | hi */\
64 "pfsub "mm_hlp","mm_base"\n\t"/* base = lo - hi | hi - hi */
/*
 * TRANSZERO_3DNOW(A0,A4,A8,A12): combines the two table entries wTB[0] and
 * wTB[k*2] into the values u (sum) and v (cross difference), then forms
 * A0 + u, A0 - u, A4 + v and A4 - v.
 *
 * Operands: outputs %0 = A0, %1 = A8, %2 = A4, %3 = A12;
 *           inputs  %4 = wTB[0], %5 = wTB[k*2], %6 = A0, %7 = A4.
 * wTB and k are not macro parameters; they are taken from the expansion site.
 * Each 64-bit value is handled as a pair of floats, written "low | high" in
 * the comments below (low = .re, high = .im).
 * mm6 and mm7 are read but not written: per the comments on the pxor lines,
 * mm6 is expected to flip the sign of the high lane and mm7 of the low lane.
 * mm0-mm5 are overwritten (mm2 also serves as PSWAP_MM scratch).
 * NOTE(review): the instructions that store the results to %0-%3 and the end
 * of the asm statement are not visible in this excerpt.
 */
67 #define TRANSZERO_3DNOW(A0,A4,A8,A12) \
69 __asm__ __volatile__(\
70 "movq %4, %%mm0\n\t" /* mm0 = wTB[0]*/\
71 "movq %5, %%mm1\n\t" /* mm1 = wTB[k*2]*/ \
72 "movq %%mm0, %%mm5\n\t"/*u.re = wTB[0].re + wTB[k*2].re;*/\
73 "pfadd %%mm1, %%mm5\n\t"/*u.im = wTB[0].im + wTB[k*2].im; mm5 = u*/\
74 "pxor %%mm6, %%mm0\n\t"/*mm0 = wTB[0].re | -wTB[0].im */\
75 "pxor %%mm7, %%mm1\n\t"/*mm1 = -wTB[k*2].re | wTB[k*2].im */\
76 "pfadd %%mm1, %%mm0\n\t"/*v.im = wTB[0].re - wTB[k*2].re; (low lane of mm0)*/\
77 "movq %%mm0, %%mm4\n\t"/*v.re =-wTB[0].im + wTB[k*2].im; (high lane of mm0)*/\
78 PSWAP_MM("%%mm4","%%mm2")/* swap lanes: mm4 = v.re | v.im; mm2 is scratch */\
79 "movq %6, %%mm0\n\t" /* mm0 = A0 */\
80 "movq %7, %%mm2\n\t" /* mm2 = A4 */\
81 "movq %%mm0, %%mm1\n\t"/* mm1 = copy of A0 */\
82 "movq %%mm2, %%mm3\n\t"/* mm3 = copy of A4 */\
83 "pfadd %%mm5, %%mm0\n\t" /* mm0 = A0 + u (new A0) */\
84 "pfadd %%mm4, %%mm2\n\t" /* mm2 = A4 + v (new A12) */\
86 "pfsub %%mm5, %%mm1\n\t" /* mm1 = A0 - u; the old comment said "A1", which this macro does not have -- presumably A8, TODO confirm */\
88 "pfsub %%mm4, %%mm3\n\t" /* mm3 = A4 - v (new A4) */\
91 :"=m"(A0), "=m"(A8), "=m"(A4), "=m"(A12)\
92 :"m"(wTB[0]), "m"(wTB[k*2]), "m"(A0), "m"(A4)\
/*
 * TRANSHALF_16_3DNOW(A2,A6,A10,A14): derives u and v from wTB[2] and wTB[6],
 * scales both by HSQRT2_3DNOW, then writes A2 + u to A2 and A2 - u to A10 and
 * forms the A6/A14 results from A6 and v.
 *
 * Operands: outputs %0 = A2, %1 = A10, %2 = A6, %3 = A14;
 *           inputs  %4 = wTB[2], %5 = wTB[6], %6 = A2, %7 = A6,
 *                   %8 = HSQRT2_3DNOW.
 * wTB and HSQRT2_3DNOW are not macro parameters; they come from the expansion
 * site. NOTE(review): by its name HSQRT2_3DNOW presumably holds sqrt(2)/2 in
 * both lanes -- confirm.
 * Each 64-bit value is handled as a pair of floats, written "low | high" in
 * the comments below (low = .re, high = .im).
 * mm6 and mm7 are read but not written: the arithmetic described by the
 * comments requires mm7 to flip the sign of the low lane and mm6 of the high
 * lane. mm0-mm5 are overwritten.
 * NOTE(review): the store to %3 (A14) and the end of the asm statement are not
 * visible in this excerpt.
 */
96 #define TRANSHALF_16_3DNOW(A2,A6,A10,A14)\
98 __asm__ __volatile__(\
99 "movq %4, %%mm0\n\t"/* mm0 = wTB[2]; u.re = wTB[2].im + wTB[2].re;*/\
100 "movq %%mm0, %%mm1\n\t"\
101 "pxor %%mm7, %%mm1\n\t"/* mm1 = -wTB[2].re | wTB[2].im */\
102 "pfacc %%mm1, %%mm0\n\t"/*u.im = wTB[2].im - wTB[2].re; mm0 = u*/\
103 "movq %5, %%mm1\n\t" /* mm1 = wTB[6]; a.re = wTB[6].im - wTB[6].re; */\
104 "movq %%mm1, %%mm2\n\t"\
105 "pxor %%mm7, %%mm1\n\t"/* mm1 = -wTB[6].re | wTB[6].im */\
106 "pfacc %%mm2, %%mm1\n\t"/*a.im = wTB[6].im + wTB[6].re; mm1 = a*/\
107 "movq %%mm1, %%mm2\n\t"\
108 "pxor %%mm7, %%mm2\n\t"/* mm2 = -a.re | a.im; v.im = u.re - a.re;*/\
109 "movq %%mm0, %%mm3\n\t"/*v.re = u.im + a.im;*/\
110 "pfadd %%mm2, %%mm3\n\t"/* mm3 = v.im | v.re (lanes still swapped) */\
111 PSWAP_MM("%%mm3","%%mm2")/* swap lanes: mm3 = v; mm2 is scratch */\
112 "pxor %%mm6, %%mm1\n\t"/* mm1 = a.re | -a.im; u.re = u.re + a.re;*/\
113 "pfadd %%mm1, %%mm0\n\t"/*u.im = u.im - a.im; mm0 = u*/\
114 "movq %8, %%mm2\n\t"/* mm2 = HSQRT2_3DNOW */\
115 "pfmul %%mm2, %%mm3\n\t" /* v *= HSQRT2_3DNOW; */\
116 "pfmul %%mm2, %%mm0\n\t" /* u *= HSQRT2_3DNOW; */\
117 "movq %6, %%mm1\n\t" /* mm1 = A2 */\
118 "movq %7, %%mm5\n\t" /* mm5 = A6 */\
119 "movq %%mm1, %%mm2\n\t"/* mm2 = copy of A2 */\
120 "movq %%mm3, %%mm4\n\t"/* mm4 = copy of v */\
121 "pfadd %%mm0, %%mm1\n\t" /*A2 = a1 + u;*/\
122 "pxor %%mm6, %%mm4\n\t"/* mm4 = v.re | -v.im; A6.re = a1.re + v.re;*/\
123 "pfsub %%mm0, %%mm2\n\t" /*A10 = a1 - u; (stored to %1 below)*/\
124 "pxor %%mm7, %%mm3\n\t"/* mm3 = -v.re | v.im; A14.re = a1.re - v.re;*/\
125 "movq %%mm1, %0\n\t"/* store A2 */\
126 "movq %%mm2, %1\n\t"/* store A10 */\
127 "movq %%mm5, %%mm2\n\t"/* mm2 = copy of A6 */\
128 "pfadd %%mm4, %%mm5\n\t"/*A6.im = a1.im - v.im;*/\
129 "pfadd %%mm3, %%mm2\n\t"/*A14.im = a1.im + v.im;*/\
130 "movq %%mm5, %2\n\t"/* store A6 */\
132 :"=m"(A2), "=m"(A10), "=m"(A6), "=m"(A14)\
133 :"m"(wTB[2]), "m"(wTB[6]), "m"(A2), "m"(A6), "m"(HSQRT2_3DNOW)\
137 #define TRANS_3DNOW(A1,A5,A9,A13,WT,WB,D,D3)\
139 __asm__ __volatile__(\
140 "movq %1, %%mm4\n\t"\
141 "movq %%mm4, %%mm5\n\t"\
142 "punpckldq %%mm4, %%mm4\n\t"/*mm4 = D.re | D.re */\
143 "punpckhdq %%mm5, %%mm5\n\t"/*mm5 = D.im | D.im */\
144 "movq %0, %%mm0\n\t"\
145 "pfmul %%mm0, %%mm4\n\t"/* mm4 =u.re | u.im */\
146 "pfmul %%mm0, %%mm5\n\t"/* mm5 = a.re | a.im */\
147 PSWAP_MM("%%mm5","%%mm3")\
148 "pxor %%mm7, %%mm5\n\t"\
149 "pfadd %%mm5, %%mm4\n\t"/* mm4 = u*/\
150 "movq %3, %%mm1\n\t"\
151 "movq %2, %%mm0\n\t"\
152 PSWAP_MM("%%mm1","%%mm3")\
153 "movq %%mm0, %%mm2\n\t"\
154 "pfmul %%mm1, %%mm0\n\t"/* mm0 = a*/\
155 "pfmul %3, %%mm2\n\t"/* mm2 = v*/\
156 PFNACC_MM("%%mm2","%%mm3")\
157 "pfacc %%mm0, %%mm0\n\t"\
158 "movq %%mm4, %%mm5\n\t"\
159 "punpckldq %%mm0,%%mm2\n\t"/*mm2 = v.re | a.re*/\
160 "pxor %%mm6, %%mm5\n\t"\
161 "movq %%mm2, %%mm3\n\t"\
162 "pxor %%mm7, %%mm3\n\t"\
163 "pfadd %%mm3, %%mm5\n\t"\
164 PSWAP_MM("%%mm5","%%mm3")/* mm5 = v*/\
165 "pfadd %%mm2, %%mm4\n\t"\
167 :"m"(WT), "m"(D), "m"(WB), "m"(D3)\
169 __asm__ __volatile__(\
170 "movq %4, %%mm0\n\t"/* a1 = A1*/\
171 "movq %5, %%mm2\n\t"/* a1 = A5*/\
172 "movq %%mm0, %%mm1\n\t"\
173 "movq %%mm2, %%mm3\n\t"\
174 "pfadd %%mm4, %%mm0\n\t"/*A1 = a1 + u*/\
175 "pfsub %%mm5, %%mm2\n\t"/*A5 = a1 - v*/\
176 "movq %%mm0, %0\n\t"\
177 "pfsub %%mm4, %%mm1\n\t"/*A9 = a1 - u*/\
178 "movq %%mm2, %2\n\t"\
179 "pfadd %%mm5, %%mm3\n\t"/*A9 = a1 + v*/\
180 "movq %%mm1, %1\n\t"\
182 :"=m"(A1), "=m"(A9), "=m"(A5), "=m"(A13)\