libjack/simd.c
/* -*- mode: c; c-file-style: "bsd"; -*- */
/*
    Copyright (C) 2005-2008 Jussi Laako

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <config.h>
#include <jack/intsimd.h>

#ifdef USE_DYNSIMD

#ifdef ARCH_X86
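/*
 * have_3dnow(): CPUID feature probe.  Queries extended function 0x80000001
 * and tests EDX bit 31 (3DNow!) and bit 30 (extended 3DNow!).  Returns 0 if
 * neither is present, 1 for 3DNow! and 2 for extended 3DNow!.  EBX is saved
 * and restored by hand because CPUID clobbers it and the compiler may be
 * using it (e.g. as the PIC register).
 */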
int
have_3dnow ()
{
    unsigned int res = 0;

#ifdef __x86_64__
    asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
    asm volatile (
        "movl $0x80000000, %%eax\n\t" \
        "cpuid\n\t" \
        "cmpl $0x80000001, %%eax\n\t" \
        "jl tdnow_prexit\n\t" \
        "movl $0x80000001, %%eax\n\t" \
        "cpuid\n\t" \
        "xorl %%eax, %%eax\n\t" \
        "movl $1, %%ecx\n\t" \
        "shll $31, %%ecx\n\t" \
        "testl %%ecx, %%edx\n\t" \
        "jz tdnow_testexit\n\t" \
        "movl $1, %%eax\n\t" \
        "movl $1, %%ecx\n\t" \
        "shll $30, %%ecx\n\t" \
        "testl %%ecx, %%edx\n\t" \
        "jz tdnow_testexit\n\t" \
        "movl $2, %%eax\n\t" \
        "jmp tdnow_testexit\n\t" \
        "tdnow_prexit:\n\t" \
        "xorl %%eax, %%eax\n\t" \
        "tdnow_testexit:\n\t"
        : "=a" (res)
        :
        : "ecx", "edx", "memory");
#ifdef __x86_64__
    asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
    return res;
}
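/*
 * have_sse(): CPUID feature probe.  Queries standard function 1 and tests
 * EDX bit 25 (SSE), EDX bit 26 (SSE2) and ECX bit 0 (SSE3).  Returns 0 for
 * no SSE, 1 for SSE, 2 for SSE2 and 3 for SSE3.
 */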
int
have_sse ()
{
    unsigned int res = 0;

#ifdef __x86_64__
    asm volatile ("pushq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("pushl %%ebx\n\t" : : : "memory");
#endif
    asm volatile (
        "movl $1, %%eax\n\t" \
        "cpuid\n\t" \
        "xorl %%eax, %%eax\n\t" \
        "movl $1, %%ebx\n\t" \
        "shll $25, %%ebx\n\t" \
        "testl %%ebx, %%edx\n\t" \
        "jz sse_testexit\n\t" \
        "movl $1, %%eax\n\t" \
        "movl $1, %%ebx\n\t" \
        "shll $26, %%ebx\n\t" \
        "testl %%ebx, %%edx\n\t" \
        "jz sse_testexit\n\t" \
        "movl $2, %%eax\n\t" \
        "movl $1, %%ebx\n\t" \
        "testl %%ebx, %%ecx\n\t" \
        "jz sse_testexit\n\t" \
        "movl $3, %%eax\n\t" \
        "sse_testexit:\n\t"
        : "=a" (res)
        :
        : "ecx", "edx", "memory");
#ifdef __x86_64__
    asm volatile ("popq %%rbx\n\t" : : : "memory");
#else
    asm volatile ("popl %%ebx\n\t" : : : "memory");
#endif
    return res;
}
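/*
 * The remaining routines are the SIMD kernels themselves.  A caller would
 * typically use the probes above once at start-up to pick a routine; the
 * disabled sketch below only illustrates that idea under assumed names --
 * the fallback function and the selector are hypothetical, not part of this
 * file or of the JACK API.
 */
#if 0
static void
fallback_copyf (float *dest, const float *src, int length)
{
    int i;

    for (i = 0; i < length; i++)
        dest[i] = src[i];
}

static void (*copy_routine) (float *, const float *, int) = fallback_copyf;

static void
select_copy_routine (void)
{
    if (have_sse ())
        copy_routine = x86_sse_copyf;       /* SSE available */
    else if (have_3dnow ())
        copy_routine = x86_3dnow_copyf;     /* 3DNow! available */
}
#endif

/*
 * x86_3dnow_copyf(): copy floats with 64-bit MMX moves, 16 samples (eight
 * MM registers) per iteration of the main loop, then two at a time, and a
 * single movd for an odd trailing sample; femms/sfence at the end leaves
 * the MMX state and orders the stores.
 */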
void
x86_3dnow_copyf (float *dest, const float *src, int length)
{
    int i, n1, n2;
    pv2sf m64p_src = (pv2sf) src;
    pv2sf m64p_dest = (pv2sf) dest;

    n1 = (length >> 4);
    n2 = ((length & 0xf) >> 1);
    for (i = 0; i < n1; i++)
    {
        asm volatile ("movq %0, %%mm0\n\t"
            : : "m" (*m64p_src++) : "mm0", "memory");
        asm volatile ("movq %0, %%mm1\n\t"
            : : "m" (*m64p_src++) : "mm1", "memory");
        asm volatile ("movq %0, %%mm2\n\t"
            : : "m" (*m64p_src++) : "mm2", "memory");
        asm volatile ("movq %0, %%mm3\n\t"
            : : "m" (*m64p_src++) : "mm3", "memory");
        asm volatile ("movq %0, %%mm4\n\t"
            : : "m" (*m64p_src++) : "mm4", "memory");
        asm volatile ("movq %0, %%mm5\n\t"
            : : "m" (*m64p_src++) : "mm5", "memory");
        asm volatile ("movq %0, %%mm6\n\t"
            : : "m" (*m64p_src++) : "mm6", "memory");
        asm volatile ("movq %0, %%mm7\n\t"
            : : "m" (*m64p_src++) : "mm7", "memory");

        asm volatile ("movq %%mm0, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm0", "memory");
        asm volatile ("movq %%mm1, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm1", "memory");
        asm volatile ("movq %%mm2, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm2", "memory");
        asm volatile ("movq %%mm3, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm3", "memory");
        asm volatile ("movq %%mm4, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm4", "memory");
        asm volatile ("movq %%mm5, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm5", "memory");
        asm volatile ("movq %%mm6, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm6", "memory");
        asm volatile ("movq %%mm7, %0\n\t"
            : "=m" (*m64p_dest++) : : "mm7", "memory");
    }
    for (i = 0; i < n2; i++)
    {
        asm volatile (
            "movq %1, %%mm0\n\t" \
            "movq %%mm0, %0\n\t"
            : "=m" (*m64p_dest++)
            : "m" (*m64p_src++)
            : "mm0", "memory");
    }
    if (length & 0x1)
    {
        asm volatile (
            "movd %1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (dest[length - 1])
            : "m" (src[length - 1])
            : "mm0", "memory");
    }
    asm volatile (
        "femms\n\t" \
        "sfence\n\t");
}
void
x86_3dnow_add2f (float *dest, const float *src, int length)
{
    int i, n;
    pv2sf m64p_dest = (pv2sf) dest;
    pv2sf m64p_src = (pv2sf) src;

    n = (length >> 1);
    for (i = 0; i < n; i++)
    {
        asm volatile (
            "movq %1, %%mm0\n\t" \
            "pfadd %2, %%mm0\n\t" \
            "movq %%mm0, %0\n\t"
            : "=m" (m64p_dest[i])
            : "m0" (m64p_dest[i]),
              "m" (m64p_src[i])
            : "mm0", "memory");
    }
209 if (n & 0x1)
        asm volatile (
            "movd %1, %%mm0\n\t" \
            "movd %2, %%mm1\n\t" \
            "pfadd %%mm1, %%mm0\n\t" \
            "movd %%mm0, %0\n\t"
            : "=m" (dest[length - 1])
            : "m0" (dest[length - 1]),
              "m" (src[length - 1])
            : "mm0", "mm1", "memory");
    }
    asm volatile (
        "femms\n\t" \
        "sfence\n\t");
}
void
x86_sse_copyf (float *dest, const float *src, int length)
{
    int i, n1, n2, si3;
    pv4sf m128p_src = (pv4sf) src;
    pv4sf m128p_dest = (pv4sf) dest;

    n1 = (length >> 5);
    n2 = ((length & 0x1f) >> 2);
    si3 = (length & ~0x3);
    for (i = 0; i < n1; i++)
    {
        asm volatile ("movaps %0, %%xmm0\n\t"
            : : "m" (*m128p_src++) : "xmm0", "memory");
        asm volatile ("movaps %0, %%xmm1\n\t"
            : : "m" (*m128p_src++) : "xmm1", "memory");
        asm volatile ("movaps %0, %%xmm2\n\t"
            : : "m" (*m128p_src++) : "xmm2", "memory");
        asm volatile ("movaps %0, %%xmm3\n\t"
            : : "m" (*m128p_src++) : "xmm3", "memory");
        asm volatile ("movaps %0, %%xmm4\n\t"
            : : "m" (*m128p_src++) : "xmm4", "memory");
        asm volatile ("movaps %0, %%xmm5\n\t"
            : : "m" (*m128p_src++) : "xmm5", "memory");
        asm volatile ("movaps %0, %%xmm6\n\t"
            : : "m" (*m128p_src++) : "xmm6", "memory");
        asm volatile ("movaps %0, %%xmm7\n\t"
            : : "m" (*m128p_src++) : "xmm7", "memory");

        asm volatile ("movaps %%xmm0, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm0", "memory");
        asm volatile ("movaps %%xmm1, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm1", "memory");
        asm volatile ("movaps %%xmm2, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm2", "memory");
        asm volatile ("movaps %%xmm3, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm3", "memory");
        asm volatile ("movaps %%xmm4, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm4", "memory");
        asm volatile ("movaps %%xmm5, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm5", "memory");
        asm volatile ("movaps %%xmm6, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm6", "memory");
        asm volatile ("movaps %%xmm7, %0\n\t"
            : "=m" (*m128p_dest++) : : "xmm7", "memory");
    }
    for (i = 0; i < n2; i++)
    {
        asm volatile (
            "movaps %1, %%xmm0\n\t" \
            "movaps %%xmm0, %0\n\t"
            : "=m" (*m128p_dest++)
            : "m" (*m128p_src++)
            : "xmm0", "memory");
    }
    for (i = si3; i < length; i++)
    {
        asm volatile (
            "movss %1, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "memory");
    }
}
void
x86_sse_add2f (float *dest, const float *src, int length)
{
    int i, n, si2;
    pv4sf m128p_src = (pv4sf) src;
    pv4sf m128p_dest = (pv4sf) dest;

    if (__builtin_expect(((long) src & 0xf) || ((long) dest & 0xf), 0))
    {
        /*jack_error("x86_sse_add2f(): non aligned pointers!");*/
        si2 = 0;
        goto sse_nonalign;
    }
    si2 = (length & ~0x3);
    n = (length >> 2);
    for (i = 0; i < n; i++)
    {
        asm volatile (
            "movaps %1, %%xmm0\n\t" \
            "addps %2, %%xmm0\n\t" \
            "movaps %%xmm0, %0\n\t"
            : "=m" (m128p_dest[i])
            : "m0" (m128p_dest[i]),
              "m" (m128p_src[i])
            : "xmm0", "memory");
    }
sse_nonalign:
    for (i = si2; i < length; i++)
    {
        asm volatile (
            "movss %1, %%xmm0\n\t" \
            "addss %2, %%xmm0\n\t" \
            "movss %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m0" (dest[i]),
              "m" (src[i])
            : "xmm0", "memory");
    }
}
void x86_sse_f2i (int *dest, const float *src, int length, float scale)
{
    int i;
    static const float max[4] __attribute__((aligned(16))) =
        { -1.0F, -1.0F, -1.0F, -1.0F };
    static const float min[4] __attribute__((aligned(16))) =
        { 1.0F, 1.0F, 1.0F, 1.0F };
    float s[4] __attribute__((aligned(16)));

    s[0] = s[1] = s[2] = s[3] = scale;
    asm volatile (
        "movaps %0, %%xmm4\n\t" \
        "movaps %1, %%xmm5\n\t" \
        "movaps %2, %%xmm6\n\t"
        :
        : "m" (*max),
          "m" (*min),
          "m" (*s)
        : "xmm4", "xmm5", "xmm6");

    if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
        goto sse_nonalign;
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "movaps %1, %%xmm1\n\t" \
            "maxps %%xmm4, %%xmm1\n\t" \
            "minps %%xmm5, %%xmm1\n\t" \
            "mulps %%xmm6, %%xmm1\n\t" \
            "cvtps2dq %%xmm1, %%xmm0\n\t" \
            "movdqa %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
    }
    return;

sse_nonalign:
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "movups %1, %%xmm1\n\t" \
            "maxps %%xmm4, %%xmm1\n\t" \
            "minps %%xmm5, %%xmm1\n\t" \
            "mulps %%xmm6, %%xmm1\n\t" \
            "cvtps2dq %%xmm1, %%xmm0\n\t" \
            "movdqu %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6", "memory");
    }
}
void x86_sse_i2f (float *dest, const int *src, int length, float scale)
{
    int i;
    float s[4] __attribute__((aligned(16)));

    s[0] = s[1] = s[2] = s[3] = scale;
    asm volatile (
        "movaps %0, %%xmm4\n\t"
        :
        : "m" (*s)
        : "xmm4" );

    if (__builtin_expect((((long) dest & 0xf) || ((long) src & 0xf)), 0))
        goto sse_nonalign;
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "cvtdq2ps %1, %%xmm0\n\t" \
            "mulps %%xmm4, %%xmm0\n\t" \
            "movaps %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm4", "memory");
    }
    return;

sse_nonalign:
    for (i = 0; i < length; i += 4)
    {
        asm volatile (
            "movdqu %1, %%xmm1\n\t" \
            "cvtdq2ps %%xmm1, %%xmm0\n\t" \
            "mulps %%xmm4, %%xmm0\n\t" \
            "movups %%xmm0, %0\n\t"
            : "=m" (dest[i])
            : "m" (src[i])
            : "xmm0", "xmm1", "xmm4", "memory");
    }
}
#endif /* ARCH_X86 */

#endif /* USE_DYNSIMD */