main: hook up cpu detection code
[pulseaudio-mirror.git] / src / pulsecore / svolume_sse.c
blobb60471a72b317f751b0675386deca69e690c8ab8
1 /***
2 This file is part of PulseAudio.
4 Copyright 2004-2006 Lennart Poettering
5 Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
7 PulseAudio is free software; you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License as published
9 by the Free Software Foundation; either version 2.1 of the License,
10 or (at your option) any later version.
12 PulseAudio is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with PulseAudio; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 USA.
21 ***/
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
27 #include <alloca.h>
29 #include <pulsecore/random.h>
30 #include <pulsecore/macro.h>
31 #include <pulsecore/g711.h>
32 #include <pulsecore/core-util.h>
34 #include "cpu-x86.h"
36 #include "sample-util.h"
37 #include "endianmacros.h"
39 #if 0
/* Plain C volume scaling for unsigned 8-bit samples (disabled placeholder).
 *
 * samples:  buffer of unsigned 8-bit samples, scaled in place
 * volumes:  one 32-bit volume per channel; the upper 16 bits are used as the
 *           integer part and the lower 16 bits as the fractional part
 * channels: number of interleaved channels
 * length:   number of samples to process */
static void
pa_volume_u8_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    while (length != 0) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        /* re-centre the unsigned sample around zero before scaling */
        int32_t scaled = (int32_t) *samples - 0x80;

        scaled = ((scaled * vol_lo) >> 16) + (scaled * vol_hi);
        scaled = PA_CLAMP_UNLIKELY(scaled, -0x80, 0x7F);
        *samples = (uint8_t) (scaled + 0x80);
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        length--;
    }
}
/* Plain C volume scaling for A-law samples (disabled placeholder).
 *
 * Each byte is expanded with st_alaw2linear16(), multiplied by the 16.16
 * split of its channel volume, clamped to the signed 16-bit range and
 * re-encoded with st_13linear2alaw(). length counts samples. */
static void
pa_volume_alaw_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    for (; length != 0; length--, samples++) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t linear = (int32_t) st_alaw2linear16(*samples);

        linear = ((linear * vol_lo) >> 16) + (linear * vol_hi);
        linear = PA_CLAMP_UNLIKELY(linear, -0x8000, 0x7FFF);
        *samples = (uint8_t) st_13linear2alaw((int16_t) linear >> 3);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
/* Plain C volume scaling for u-law samples (disabled placeholder).
 *
 * Each byte is expanded with st_ulaw2linear16(), multiplied by the 16.16
 * split of its channel volume, clamped to the signed 16-bit range and
 * re-encoded with st_14linear2ulaw(). length counts samples. */
static void
pa_volume_ulaw_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    for (; length != 0; length--, samples++) {
        const int32_t vol_hi = volumes[ch] >> 16;
        const int32_t vol_lo = volumes[ch] & 0xFFFF;
        int32_t linear = (int32_t) st_ulaw2linear16(*samples);

        linear = ((linear * vol_lo) >> 16) + (linear * vol_hi);
        linear = PA_CLAMP_UNLIKELY(linear, -0x8000, 0x7FFF);
        *samples = (uint8_t) st_14linear2ulaw((int16_t) linear >> 2);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
102 #endif
/* Multiply packed signed 16-bit samples by packed 32-bit volumes.
 *
 * smp: xmm register holding the samples in its low 16-bit words (destroyed)
 * vol: xmm register holding one 32-bit volume (vh:vl) per sample; on exit its
 *      low words hold the saturated 16-bit products
 *
 * Per sample this computes ((p * vl) >> 16) + (p * vh), saturated to 16 bit.
 * xmm4 and xmm5 are used as scratch registers. */
#define VOLUME_32x16(smp,vol)              /* vol: .. | vh | vl | */                    \
      " pxor %%xmm4, %%xmm4 \n\t"          /* xmm4 = 0 */                               \
      " punpcklwd %%xmm4, "#smp" \n\t"     /* smp: .. | 0 | p0 | */                     \
      " pcmpgtw "#smp", %%xmm4 \n\t"       /* xmm4: .. | 0 | 0xffff if p0 < 0 | */      \
      " pand "#vol", %%xmm4 \n\t"          /* xmm4: .. | 0 | vl if p0 < 0 | */          \
      " movdqa "#smp", %%xmm5 \n\t"        /* xmm5: copy of | 0 | p0 | */               \
      " pmulhuw "#vol", "#smp" \n\t"       /* smp: .. | 0 | (vl*p0)>>16, unsigned | */  \
      " psubd %%xmm4, "#smp" \n\t"         /* smp: correct for negative p0 */           \
      " psrld $16, "#vol" \n\t"            /* vol: .. | 0 | vh | */                     \
      " pmaddwd %%xmm5, "#vol" \n\t"       /* vol: .. | p0 * vh | */                    \
      " paddd "#smp", "#vol" \n\t"         /* vol: .. | p0 * v0 | */                    \
      " packssdw "#vol", "#vol" \n\t"      /* vol: .. | p1*v1 | p0*v0 |, saturated */
/* Advance the channel index held in asm operand %3 by inc and wrap it once
 * when it reaches limit. Asm operand %4 is used as scratch. */
#define MOD_ADD(inc,limit)                                                  \
      " add "#inc", %3 \n\t"        /* channel += inc */                    \
      " mov %3, %4 \n\t"                                                    \
      " sub "#limit", %4 \n\t"      /* scratch = channel - limit */         \
      " cmp "#limit", %3 \n\t"      /* channel >= limit ? */                \
      " cmovae %4, %3 \n\t"         /* then channel = scratch */
/* Swap the two bytes of every 16-bit word in xmm register reg.
 * xmm4 is used as scratch. */
#define SWAP_16(reg)                                                        \
      " movdqa "#reg", %%xmm4 \n\t" /* xmm4: .. | h l | */                  \
      " psrlw $8, %%xmm4 \n\t"      /* xmm4: .. | 0 h | */                  \
      " psllw $8, "#reg" \n\t"      /* reg:  .. | l 0 | */                  \
      " por %%xmm4, "#reg" \n\t"    /* reg:  .. | l h | */
/* Byte-swap every 16-bit word of two xmm registers at once; the two
 * instruction streams are interleaved for better pairing.
 * xmm4 and xmm5 are used as scratch. */
#define SWAP_16_2(reg_a,reg_b)                                              \
      " movdqa "#reg_a", %%xmm4 \n\t" /* xmm4: .. | h l | */                \
      " movdqa "#reg_b", %%xmm5 \n\t"                                       \
      " psrlw $8, %%xmm4 \n\t"        /* xmm4: .. | 0 h | */                \
      " psrlw $8, %%xmm5 \n\t"                                              \
      " psllw $8, "#reg_a" \n\t"      /* reg_a: .. | l 0 | */               \
      " psllw $8, "#reg_b" \n\t"                                            \
      " por %%xmm4, "#reg_a" \n\t"    /* reg_a: .. | l h | */               \
      " por %%xmm5, "#reg_b" \n\t"
142 static void
143 pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
145 pa_reg_x86 channel, temp;
147 /* the max number of samples we process at a time, this is also the max amount
148 * we overread the volume array, which should have enough padding. */
149 channels = MAX (8, channels);
151 __asm__ __volatile__ (
152 " xor %3, %3 \n\t"
153 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
155 " test $1, %2 \n\t" /* check for odd samples */
156 " je 2f \n\t"
158 " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
159 " movw (%0), %4 \n\t" /* .. | p0 | */
160 " movd %4, %%xmm1 \n\t"
161 VOLUME_32x16 (%%xmm1, %%xmm0)
162 " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
163 " movw %4, (%0) \n\t"
164 " add $2, %0 \n\t"
165 MOD_ADD ($1, %5)
167 "2: \n\t"
168 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
169 " test $1, %2 \n\t"
170 " je 4f \n\t"
172 "3: \n\t" /* do samples in groups of 2 */
173 " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
174 " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
175 VOLUME_32x16 (%%xmm1, %%xmm0)
176 " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
177 " add $4, %0 \n\t"
178 MOD_ADD ($2, %5)
180 "4: \n\t"
181 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
182 " test $1, %2 \n\t"
183 " je 6f \n\t"
185 "5: \n\t" /* do samples in groups of 4 */
186 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
187 " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
188 VOLUME_32x16 (%%xmm1, %%xmm0)
189 " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
190 " add $8, %0 \n\t"
191 MOD_ADD ($4, %5)
193 "6: \n\t"
194 " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
195 " cmp $0, %2 \n\t"
196 " je 8f \n\t"
198 "7: \n\t" /* do samples in groups of 8 */
199 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
200 " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
201 " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
202 " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
203 VOLUME_32x16 (%%xmm1, %%xmm0)
204 VOLUME_32x16 (%%xmm3, %%xmm2)
205 " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
206 " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
207 " add $16, %0 \n\t"
208 MOD_ADD ($8, %5)
209 " dec %2 \n\t"
210 " jne 7b \n\t"
211 "8: \n\t"
213 : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
214 : "r" ((pa_reg_x86)channels)
215 : "cc"
219 static void
220 pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
222 pa_reg_x86 channel, temp;
224 /* the max number of samples we process at a time, this is also the max amount
225 * we overread the volume array, which should have enough padding. */
226 channels = MAX (8, channels);
228 __asm__ __volatile__ (
229 " xor %3, %3 \n\t"
230 " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
232 " test $1, %2 \n\t" /* check for odd samples */
233 " je 2f \n\t"
235 " movd (%1, %3, 4), %%xmm0 \n\t" /* do odd sample */
236 " movw (%0), %4 \n\t"
237 " rorw $8, %4 \n\t"
238 " movd %4, %%xmm1 \n\t"
239 VOLUME_32x16 (%%xmm1, %%xmm0)
240 " movd %%xmm0, %4 \n\t"
241 " rorw $8, %4 \n\t"
242 " movw %4, (%0) \n\t"
243 " add $2, %0 \n\t"
244 MOD_ADD ($1, %5)
246 "2: \n\t"
247 " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
248 " test $1, %2 \n\t" /* check for odd samples */
249 " je 4f \n\t"
251 "3: \n\t" /* do samples in pairs of 2 */
252 " movq (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
253 " movd (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
254 SWAP_16 (%%xmm1)
255 VOLUME_32x16 (%%xmm1, %%xmm0)
256 SWAP_16 (%%xmm0)
257 " movd %%xmm0, (%0) \n\t"
258 " add $4, %0 \n\t"
259 MOD_ADD ($2, %5)
261 "4: \n\t"
262 " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
263 " test $1, %2 \n\t" /* check for odd samples */
264 " je 6f \n\t"
266 "5: \n\t" /* do samples in pairs of 4 */
267 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
268 " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
269 SWAP_16 (%%xmm1)
270 VOLUME_32x16 (%%xmm1, %%xmm0)
271 SWAP_16 (%%xmm0)
272 " movq %%xmm0, (%0) \n\t"
273 " add $8, %0 \n\t"
274 MOD_ADD ($4, %5)
276 "6: \n\t"
277 " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
278 " cmp $0, %2 \n\t"
279 " je 8f \n\t"
281 "7: \n\t" /* do samples in pairs of 8 */
282 " movdqu (%1, %3, 4), %%xmm0 \n\t" /* v1_h | v1_l | v0_h | v0_l */
283 " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* v3_h | v3_l | v2_h | v2_l */
284 " movq (%0), %%xmm1 \n\t" /* X | X | p1 | p0 */
285 " movq 8(%0), %%xmm3 \n\t" /* X | X | p3 | p2 */
286 SWAP_16_2 (%%xmm1, %%xmm3)
287 VOLUME_32x16 (%%xmm1, %%xmm0)
288 VOLUME_32x16 (%%xmm3, %%xmm2)
289 SWAP_16_2 (%%xmm0, %%xmm2)
290 " movq %%xmm0, (%0) \n\t"
291 " movq %%xmm2, 8(%0) \n\t"
292 " add $16, %0 \n\t"
293 MOD_ADD ($8, %5)
294 " dec %2 \n\t"
295 " jne 7b \n\t"
296 "8: \n\t"
298 : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
299 : "r" ((pa_reg_x86)channels)
300 : "cc"
304 #if 0
/* Plain C volume scaling for native-endian float samples (disabled
 * placeholder). length is the buffer size in bytes; every sample is
 * multiplied in place by the volume of its channel. */
static void
pa_volume_float32ne_sse (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    length /= sizeof (float);

    while (length != 0) {
        *samples *= volumes[ch];
        samples++;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        length--;
    }
}
/* Plain C volume scaling for reverse-endian float samples (disabled
 * placeholder). length is the buffer size in bytes; every sample is passed
 * through PA_FLOAT32_SWAP, multiplied by its channel volume and swapped back. */
static void
pa_volume_float32re_sse (float *samples, float *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    length /= sizeof (float);

    for (; length != 0; length--, samples++) {
        float value = PA_FLOAT32_SWAP(*samples);

        value *= volumes[ch];
        *samples = PA_FLOAT32_SWAP(value);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
/* Plain C volume scaling for native-endian signed 32-bit samples (disabled
 * placeholder). length is the buffer size in bytes; each sample is widened to
 * 64 bit, multiplied by its channel volume, shifted right by 16 and clamped
 * to the signed 32-bit range. */
static void
pa_volume_s32ne_sse (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    length /= sizeof (int32_t);

    for (; length != 0; length--, samples++) {
        int64_t wide = (int64_t)(*samples);

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = (int32_t) wide;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
/* Plain C volume scaling for reverse-endian signed 32-bit samples (disabled
 * placeholder). Same arithmetic as the native-endian variant, with
 * PA_INT32_SWAP applied when a sample is read and again when it is stored.
 * length is the buffer size in bytes. */
static void
pa_volume_s32re_sse (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    length /= sizeof (int32_t);

    for (; length != 0; length--, samples++) {
        int64_t wide = (int64_t) PA_INT32_SWAP(*samples);

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = PA_INT32_SWAP((int32_t) wide);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
/* Plain C volume scaling for packed native-endian 24-bit samples (disabled
 * placeholder). length is the buffer size in bytes and samples are 3 bytes
 * apart. Each sample is read with PA_READ24NE, moved to the top of a 32-bit
 * value, scaled in 64 bit, clamped, and written back with PA_WRITE24NE. */
static void
pa_volume_s24ne_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        int64_t wide = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24NE(samples, ((uint32_t) (int32_t) wide) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        samples += 3;
    }
}
/* Plain C volume scaling for packed reverse-endian 24-bit samples (disabled
 * placeholder). length is the buffer size in bytes and samples are 3 bytes
 * apart. Each sample is read with PA_READ24RE, moved to the top of a 32-bit
 * value, scaled in 64 bit, clamped, and written back with PA_WRITE24RE. */
static void
pa_volume_s24re_sse (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    uint8_t *end = samples + length;
    unsigned ch = 0;

    while (samples < end) {
        int64_t wide = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        PA_WRITE24RE(samples, ((uint32_t) (int32_t) wide) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;

        samples += 3;
    }
}
/* Plain C volume scaling for native-endian 24-bit samples stored in 32-bit
 * words (disabled placeholder). length is the buffer size in bytes. Each word
 * is shifted left by 8, scaled in 64 bit, clamped to the signed 32-bit range
 * and shifted back right by 8 before it is stored. */
static void
pa_volume_s24_32ne_sse (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    length /= sizeof (uint32_t);

    for (; length != 0; length--, samples++) {
        int64_t wide = (int64_t) ((int32_t) (*samples << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = ((uint32_t) ((int32_t) wide)) >> 8;

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
/* Plain C volume scaling for reverse-endian 24-bit samples stored in 32-bit
 * words (disabled placeholder). Same arithmetic as the native-endian variant,
 * with PA_UINT32_SWAP applied when a word is read and again when it is
 * stored. length is the buffer size in bytes. */
static void
pa_volume_s24_32re_sse (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
    unsigned ch = 0;

    length /= sizeof (uint32_t);

    for (; length != 0; length--, samples++) {
        int64_t wide = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));

        wide = (wide * volumes[ch]) >> 16;
        wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
        *samples = PA_UINT32_SWAP(((uint32_t) ((int32_t) wide)) >> 8);

        ch++;
        if (PA_UNLIKELY(ch >= channels))
            ch = 0;
    }
}
460 #endif
462 #undef RUN_TEST
464 #ifdef RUN_TEST
465 #define CHANNELS 2
466 #define SAMPLES 1021
467 #define TIMES 1000
468 #define PADDING 16
470 static void run_test (void) {
471 int16_t samples[SAMPLES];
472 int16_t samples_ref[SAMPLES];
473 int16_t samples_orig[SAMPLES];
474 int32_t volumes[CHANNELS + PADDING];
475 int i, j, padding;
476 pa_do_volume_func_t func;
478 func = pa_get_volume_func (PA_SAMPLE_S16RE);
480 printf ("checking SSE %d\n", sizeof (samples));
482 for (j = 0; j < TIMES; j++) {
483 pa_random (samples, sizeof (samples));
484 memcpy (samples_ref, samples, sizeof (samples));
485 memcpy (samples_orig, samples, sizeof (samples));
487 for (i = 0; i < CHANNELS; i++)
488 volumes[i] = rand() >> 1;
489 for (padding = 0; padding < PADDING; padding++, i++)
490 volumes[i] = volumes[padding];
492 pa_volume_s16re_sse (samples, volumes, CHANNELS, SAMPLES * sizeof (int16_t));
493 func (samples_ref, volumes, CHANNELS, SAMPLES * sizeof (int16_t));
495 for (i = 0; i < SAMPLES; i++) {
496 if (samples[i] != samples_ref[i]) {
497 printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
498 samples_orig[i], volumes[i % CHANNELS]);
503 #endif
/* Registers the SSE volume routines of this file.
 *
 * Logs an info message, runs run_test() first when RUN_TEST is defined, and
 * then installs pa_volume_s16ne_sse and pa_volume_s16re_sse through
 * pa_set_volume_func() for PA_SAMPLE_S16NE and PA_SAMPLE_S16RE.
 *
 * NOTE(review): `flags` is not read in the lines visible here, while the
 * installed routines use integer instructions on xmm registers (SSE2).
 * Confirm that callers only invoke this when the CPU supports SSE2. */
505 void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) {
506 pa_log_info("Initialising SSE optimized functions.");
508 #ifdef RUN_TEST
509 run_test ();
510 #endif
/* the casts adapt the typed sample/volume pointers to pa_do_volume_func_t */
512 pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse);
513 pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse);