/* -*- mode: c; c-file-style: "bsd"; -*- */
/*
    Copyright (C) 2005-2008 Jussi Laako

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/

#include <config.h>

#ifdef USE_DYNSIMD

#include <jack/intsimd.h>

#ifdef ARCH_X86
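/* Runtime CPUID probe for 3DNow!.  Returns 0 (not available), 1 (3DNow!)
   or 2 (extended 3DNow!), taken from bits 31 and 30 of EDX for CPUID leaf
   0x80000001.  %ebx/%rbx is saved and restored by hand because it may
   serve as the PIC register and so must not appear in a clobber list. */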
int
have_3dnow ()
{
        unsigned int res = 0;

#ifdef __x86_64__
        asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
        asm volatile (
                "movl $0x80000000, %%eax\n\t" \
                "cpuid\n\t" \
                "cmpl $0x80000001, %%eax\n\t" \
                "jl tdnow_prexit\n\t" \
                "movl $0x80000001, %%eax\n\t" \
                "cpuid\n\t" \
                "xorl %%eax, %%eax\n\t" \
                "movl $1, %%ecx\n\t" \
                "shll $31, %%ecx\n\t" \
                "testl %%ecx, %%edx\n\t" \
                "jz tdnow_testexit\n\t" \
                "movl $1, %%eax\n\t" \
                "movl $1, %%ecx\n\t" \
                "shll $30, %%ecx\n\t" \
                "testl %%ecx, %%edx\n\t" \
                "jz tdnow_testexit\n\t" \
                "movl $2, %%eax\n\t" \
                "jmp tdnow_testexit\n\t" \
                "tdnow_prexit:\n\t" \
                "xorl %%eax, %%eax\n\t" \
                "tdnow_testexit:\n\t"
                : "=a" (res)
                :
                : "ecx", "edx", "memory");
#ifdef __x86_64__
        asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
        return res;
}
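/* Runtime CPUID probe for SSE.  Returns 0 (none), 1 (SSE), 2 (SSE2) or
   3 (SSE3), taken from EDX bits 25/26 and ECX bit 0 of CPUID leaf 1. */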
int
have_sse ()
{
        unsigned int res = 0;

#ifdef __x86_64__
        asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
        asm volatile (
                "movl $1, %%eax\n\t" \
                "cpuid\n\t" \
                "xorl %%eax, %%eax\n\t" \
                "movl $1, %%ebx\n\t" \
                "shll $25, %%ebx\n\t" \
                "testl %%ebx, %%edx\n\t" \
                "jz sse_testexit\n\t" \
                "movl $1, %%eax\n\t" \
                "movl $1, %%ebx\n\t" \
                "shll $26, %%ebx\n\t" \
                "testl %%ebx, %%edx\n\t" \
                "jz sse_testexit\n\t" \
                "movl $2, %%eax\n\t" \
                "movl $1, %%ebx\n\t" \
                "testl %%ebx, %%ecx\n\t" \
                "jz sse_testexit\n\t" \
                "movl $3, %%eax\n\t" \
                "sse_testexit:\n\t"
                : "=a" (res)
                :
                : "ecx", "edx", "memory");
#ifdef __x86_64__
        asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
        asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
        return res;
}
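/* Copy `length' floats with MMX 64-bit moves: an unrolled loop of 16
   floats per iteration, then two at a time, then a final odd float. */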
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
        int i, n1, n2;
        pv2sf m64p_src = (pv2sf) src;
        pv2sf m64p_dest = (pv2sf) dest;

        n1 = (length >> 4);
        n2 = ((length & 0xf) >> 1);
        for (i = 0; i < n1; i++)
        {
                asm volatile ("movq %0, %%mm0\n\t"
                        : : "m" (*m64p_src++) : "mm0", "memory");
                asm volatile ("movq %0, %%mm1\n\t"
                        : : "m" (*m64p_src++) : "mm1", "memory");
                asm volatile ("movq %0, %%mm2\n\t"
                        : : "m" (*m64p_src++) : "mm2", "memory");
                asm volatile ("movq %0, %%mm3\n\t"
                        : : "m" (*m64p_src++) : "mm3", "memory");
                asm volatile ("movq %0, %%mm4\n\t"
                        : : "m" (*m64p_src++) : "mm4", "memory");
                asm volatile ("movq %0, %%mm5\n\t"
                        : : "m" (*m64p_src++) : "mm5", "memory");
                asm volatile ("movq %0, %%mm6\n\t"
                        : : "m" (*m64p_src++) : "mm6", "memory");
                asm volatile ("movq %0, %%mm7\n\t"
                        : : "m" (*m64p_src++) : "mm7", "memory");

                asm volatile ("movq %%mm0, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm0", "memory");
                asm volatile ("movq %%mm1, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm1", "memory");
                asm volatile ("movq %%mm2, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm2", "memory");
                asm volatile ("movq %%mm3, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm3", "memory");
                asm volatile ("movq %%mm4, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm4", "memory");
                asm volatile ("movq %%mm5, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm5", "memory");
                asm volatile ("movq %%mm6, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm6", "memory");
                asm volatile ("movq %%mm7, %0\n\t"
                        : "=m" (*m64p_dest++) : : "mm7", "memory");
        }
        for (i = 0; i < n2; i++)
        {
                asm volatile (
                        "movq %1, %%mm0\n\t" \
                        "movq %%mm0, %0\n\t"
                        : "=m" (*m64p_dest++)
                        : "m" (*m64p_src++)
                        : "mm0", "memory");
        }
        if (length & 0x1)
        {
                asm volatile (
                        "movd %1, %%mm0\n\t" \
                        "movd %%mm0, %0\n\t"
                        : "=m" (dest[length - 1])
                        : "m" (src[length - 1])
                        : "mm0", "memory");
        }
        /* clear the MMX state so the FPU is usable again */
        asm volatile (
                "femms\n\t" \
                "sfence\n\t");
}
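/* dest[i] += src[i] with the 3DNow! pfadd instruction, two floats at a
   time, plus a scalar fixup for an odd trailing element. */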
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
        int i, n;
        pv2sf m64p_dest = (pv2sf) dest;
        pv2sf m64p_src = (pv2sf) src;

        n = (length >> 1);
        for (i = 0; i < n; i++)
        {
                asm volatile (
                        "movq %1, %%mm0\n\t" \
                        "pfadd %2, %%mm0\n\t" \
                        "movq %%mm0, %0\n\t"
                        : "=m" (m64p_dest[i])
                        : "m0" (m64p_dest[i]),
                          "m" (m64p_src[i])
                        : "mm0", "memory");
        }
        if (length & 0x1)
        {
                asm volatile (
                        "movd %1, %%mm0\n\t" \
                        "movd %2, %%mm1\n\t" \
                        "pfadd %%mm1, %%mm0\n\t" \
                        "movd %%mm0, %0\n\t"
                        : "=m" (dest[length - 1])
                        : "m0" (dest[length - 1]),
                          "m" (src[length - 1])
                        : "mm0", "mm1", "memory");
        }
        /* clear the MMX state so the FPU is usable again */
        asm volatile (
                "femms\n\t" \
                "sfence\n\t");
}
void
x86_sse_copyf (float *dest, const float *src, int length)
{
        int i, n1, n2, si3;
        pv4sf m128p_src = (pv4sf) src;
        pv4sf m128p_dest = (pv4sf) dest;

        n1 = (length >> 5);
        n2 = ((length & 0x1f) >> 2);
        si3 = (length & ~0x3);
        for (i = 0; i < n1; i++)
        {
                asm volatile ("movaps %0, %%xmm0\n\t"
                        : : "m" (*m128p_src++) : "xmm0", "memory");
                asm volatile ("movaps %0, %%xmm1\n\t"
                        : : "m" (*m128p_src++) : "xmm1", "memory");
                asm volatile ("movaps %0, %%xmm2\n\t"
                        : : "m" (*m128p_src++) : "xmm2", "memory");
                asm volatile ("movaps %0, %%xmm3\n\t"
                        : : "m" (*m128p_src++) : "xmm3", "memory");
                asm volatile ("movaps %0, %%xmm4\n\t"
                        : : "m" (*m128p_src++) : "xmm4", "memory");
                asm volatile ("movaps %0, %%xmm5\n\t"
                        : : "m" (*m128p_src++) : "xmm5", "memory");
                asm volatile ("movaps %0, %%xmm6\n\t"
                        : : "m" (*m128p_src++) : "xmm6", "memory");
                asm volatile ("movaps %0, %%xmm7\n\t"
                        : : "m" (*m128p_src++) : "xmm7", "memory");

                asm volatile ("movaps %%xmm0, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm0", "memory");
                asm volatile ("movaps %%xmm1, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm1", "memory");
                asm volatile ("movaps %%xmm2, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm2", "memory");
                asm volatile ("movaps %%xmm3, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm3", "memory");
                asm volatile ("movaps %%xmm4, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm4", "memory");
                asm volatile ("movaps %%xmm5, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm5", "memory");
                asm volatile ("movaps %%xmm6, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm6", "memory");
                asm volatile ("movaps %%xmm7, %0\n\t"
                        : "=m" (*m128p_dest++) : : "xmm7", "memory");
        }
        for (i = 0; i < n2; i++)
        {
                asm volatile (
                        "movaps %1, %%xmm0\n\t" \
                        "movaps %%xmm0, %0\n\t"
                        : "=m" (*m128p_dest++)
                        : "m" (*m128p_src++)
                        : "xmm0", "memory");
        }
        for (i = si3; i < length; i++)
        {
                asm volatile (
                        "movss %1, %%xmm0\n\t" \
                        "movss %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "memory");
        }
}
void
x86_sse_add2f (float *dest, const float *src, int length)
{
        int i, n, si2;
        pv4sf m128p_src = (pv4sf) src;
        pv4sf m128p_dest = (pv4sf) dest;

        if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
        {
                /*jack_error("x86_sse_add2f(): non aligned pointers!");*/
                /* fall back to the scalar loop below for the whole buffer */
                si2 = 0;
                n = 0;
        }
        else
        {
                si2 = (length & ~0x3);
                n = (length >> 2);
        }

        for (i = 0; i < n; i++)
        {
                asm volatile (
                        "movaps %1, %%xmm0\n\t" \
                        "addps %2, %%xmm0\n\t" \
                        "movaps %%xmm0, %0\n\t"
                        : "=m" (m128p_dest[i])
                        : "m0" (m128p_dest[i]),
                          "m" (m128p_src[i])
                        : "xmm0", "memory");
        }
        for (i = si2; i < length; i++)
        {
                asm volatile (
                        "movss %1, %%xmm0\n\t" \
                        "addss %2, %%xmm0\n\t" \
                        "movss %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m0" (dest[i]),
                          "m" (src[i])
                        : "xmm0", "memory");
        }
}
void x86_sse_f2i (int *dest, const float *src, int length, float scale)
{
        int i;
        static const float max[4] __attribute__((aligned(16))) =
                { -1.0F, -1.0F, -1.0F, -1.0F };
        static const float min[4] __attribute__((aligned(16))) =
                { 1.0F, 1.0F, 1.0F, 1.0F };
        float s[4] __attribute__((aligned(16)));

        s[0] = s[1] = s[2] = s[3] = scale;
        asm volatile (
                "movaps %0, %%xmm4\n\t" \
                "movaps %1, %%xmm5\n\t" \
                "movaps %2, %%xmm6\n\t"
                :
                : "m" (*max),
                  "m" (*min),
                  "m" (*s)
                : "xmm4", "xmm5", "xmm6");

        if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
                goto sse_f2i_slow;

        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "movaps %1, %%xmm1\n\t" \
                        "maxps %%xmm4, %%xmm1\n\t" \
                        "minps %%xmm5, %%xmm1\n\t" \
                        "mulps %%xmm6, %%xmm1\n\t" \
                        "cvtps2dq %%xmm1, %%xmm0\n\t" \
                        "movdqa %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
        }
        return;

sse_f2i_slow:
        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "movups %1, %%xmm1\n\t" \
                        "maxps %%xmm4, %%xmm1\n\t" \
                        "minps %%xmm5, %%xmm1\n\t" \
                        "mulps %%xmm6, %%xmm1\n\t" \
                        "cvtps2dq %%xmm1, %%xmm0\n\t" \
                        "movdqu %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
        }
}
void x86_sse_i2f (float *dest, const int *src, int length, float scale)
{
        int i;
        float s[4] __attribute__((aligned(16)));

        s[0] = s[1] = s[2] = s[3] = scale;
        asm volatile (
                "movaps %0, %%xmm4\n\t"
                :
                : "m" (*s)
                : "xmm4");

        if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
                goto sse_i2f_slow;

        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "cvtdq2ps %1, %%xmm0\n\t" \
                        "mulps %%xmm4, %%xmm0\n\t" \
                        "movaps %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm4", "memory");
        }
        return;

sse_i2f_slow:
        for (i = 0; i < length; i += 4)
        {
                asm volatile (
                        "movdqu %1, %%xmm1\n\t" \
                        "cvtdq2ps %%xmm1, %%xmm0\n\t" \
                        "mulps %%xmm4, %%xmm0\n\t" \
                        "movups %%xmm0, %0\n\t"
                        : "=m" (dest[i])
                        : "m" (src[i])
                        : "xmm0", "xmm1", "xmm4", "memory");
        }
}
#endif /* ARCH_X86 */

#endif /* USE_DYNSIMD */
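/* Usage sketch (hypothetical, not part of this file): a caller would
   typically select a routine once at start-up from the CPUID probes
   above, e.g.

       void (*copyf) (float *, const float *, int) = fallback_copyf;

       if (have_sse ())
           copyf = x86_sse_copyf;
       else if (have_3dnow ())
           copyf = x86_3dnow_copyf;

   where `fallback_copyf' is an assumed plain-C loop.  The SSE variants
   expect 16-byte aligned buffers (and, for f2i/i2f, a length that is a
   multiple of four) as noted in the comments above. */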