src/pulsecore/sconv_sse.c

   1 /***
   2   This file is part of PulseAudio.
   3
   4   Copyright 2004-2006 Lennart Poettering
   5   Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
   6
   7   PulseAudio is free software; you can redistribute it and/or modify
   8   it under the terms of the GNU Lesser General Public License as published
   9   by the Free Software Foundation; either version 2.1 of the License,
  10   or (at your option) any later version.
  11
  12   PulseAudio is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with PulseAudio; if not, write to the Free Software
  19   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  20   USA.
  21 ***/
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29
  30 #include <pulsecore/macro.h>
  31 #include <pulsecore/endianmacros.h>
  32
  33 #include "cpu-x86.h"
  34 #include "sconv.h"
  35
  36 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
  37
  38 static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
  39 static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
  40 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
  41
  42 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
  43     pa_reg_x86 temp, i;
  44
  45     __asm__ __volatile__ (
  46         " movaps %5, %%xmm5             \n\t"
  47         " movaps %6, %%xmm6             \n\t"
  48         " movaps %7, %%xmm7             \n\t"
  49         " xor %0, %0                    \n\t"
  50
  51         " mov %4, %1                    \n\t"
  52         " sar $3, %1                    \n\t" /* 8 floats at a time */
  53         " cmp $0, %1                    \n\t"
  54         " je 2f                         \n\t"
  55
  56         "1:                             \n\t"
  57         " movups (%2, %0, 2), %%xmm0    \n\t" /* read 8 floats */
  58         " movups 16(%2, %0, 2), %%xmm2  \n\t"
  59         " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
  60         " minps  %%xmm5, %%xmm2         \n\t"
  61         " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
  62         " maxps  %%xmm6, %%xmm2         \n\t"
  63         " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
  64         " mulps  %%xmm7, %%xmm2         \n\t"
  65
  66         " cvtps2pi %%xmm0, %%mm0        \n\t" /* low part to int */
  67         " cvtps2pi %%xmm2, %%mm2        \n\t"
  68         " movhlps  %%xmm0, %%xmm0       \n\t" /* bring high part in position */
  69         " movhlps  %%xmm2, %%xmm2       \n\t"
  70         " cvtps2pi %%xmm0, %%mm1        \n\t" /* high part to int */
  71         " cvtps2pi %%xmm2, %%mm3        \n\t"
  72
  73         " packssdw %%mm1, %%mm0         \n\t" /* pack parts */
  74         " packssdw %%mm3, %%mm2         \n\t"
  75         " movq     %%mm0, (%3, %0)      \n\t"
  76         " movq    %%mm2, 8(%3, %0)     \n\t"
  77
  78         " add $16, %0                   \n\t"
  79         " dec %1                        \n\t"
  80         " jne 1b                        \n\t"
  81
  82         "2:                             \n\t"
  83         " mov %4, %1                    \n\t" /* prepare for leftovers */
  84         " and $7, %1                    \n\t"
  85         " je 4f                         \n\t"
  86
  87         "3:                             \n\t"
  88         " movss (%2, %0, 2), %%xmm0     \n\t"
  89         " minss  %%xmm5, %%xmm0         \n\t"
  90         " maxss  %%xmm6, %%xmm0         \n\t"
  91         " mulss  %%xmm7, %%xmm0         \n\t"
  92         " cvtss2si %%xmm0, %4           \n\t"
  93         " movw  %w4, (%3, %0)           \n\t"
  94         " add $2, %0                    \n\t"
  95         " dec %1                        \n\t"
  96         " jne 3b                        \n\t"
  97
  98         "4:                             \n\t"
  99         " emms                          \n\t"
 100
 101         : "=&r" (i), "=&r" (temp)
 102         : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
 103         : "cc", "memory"
 104     );
 105 }
 106
 107 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
 108     pa_reg_x86 temp, i;
 109
 110     __asm__ __volatile__ (
 111         " movaps %5, %%xmm5             \n\t"
 112         " movaps %6, %%xmm6             \n\t"
 113         " movaps %7, %%xmm7             \n\t"
 114         " xor %0, %0                    \n\t"
 115
 116         " mov %4, %1                    \n\t"
 117         " sar $3, %1                    \n\t" /* 8 floats at a time */
 118         " cmp $0, %1                    \n\t"
 119         " je 2f                         \n\t"
 120
 121         "1:                             \n\t"
 122         " movups (%2, %0, 2), %%xmm0    \n\t" /* read 8 floats */
 123         " movups 16(%2, %0, 2), %%xmm2  \n\t"
 124         " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
 125         " minps  %%xmm5, %%xmm2         \n\t"
 126         " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
 127         " maxps  %%xmm6, %%xmm2         \n\t"
 128         " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
 129         " mulps  %%xmm7, %%xmm2         \n\t"
 130
 131         " cvtps2dq %%xmm0, %%xmm0       \n\t"
 132         " cvtps2dq %%xmm2, %%xmm2       \n\t"
 133
 134         " packssdw %%xmm2, %%xmm0       \n\t"
 135         " movdqu   %%xmm0, (%3, %0)     \n\t"
 136
 137         " add $16, %0                   \n\t"
 138         " dec %1                        \n\t"
 139         " jne 1b                        \n\t"
 140
 141         "2:                             \n\t"
 142         " mov %4, %1                    \n\t" /* prepare for leftovers */
 143         " and $7, %1                    \n\t"
 144         " je 4f                         \n\t"
 145
 146         "3:                             \n\t"
 147         " movss (%2, %0, 2), %%xmm0     \n\t"
 148         " minss  %%xmm5, %%xmm0         \n\t"
 149         " maxss  %%xmm6, %%xmm0         \n\t"
 150         " mulss  %%xmm7, %%xmm0         \n\t"
 151         " cvtss2si %%xmm0, %4           \n\t"
 152         " movw  %w4, (%3, %0)           \n\t"
 153         " add $2, %0                    \n\t"
 154         " dec %1                        \n\t"
 155         " jne 3b                        \n\t"
 156
 157         "4:                             \n\t"
 158
 159         : "=&r" (i), "=&r" (temp)
 160         : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
 161         : "cc", "memory"
 162     );
 163 }
 164
 165 #undef RUN_TEST
 166
 167 #ifdef RUN_TEST
 168 #define SAMPLES 1019
 169 #define TIMES 1000
 170
 171 static void run_test(void) {
 172     int16_t samples[SAMPLES];
 173     int16_t samples_ref[SAMPLES];
 174     float floats[SAMPLES];
 175     int i;
 176     pa_usec_t start, stop;
 177     pa_convert_func_t func;
 178
 179     printf("checking SSE %zd\n", sizeof(samples));
 180
 181     memset(samples_ref, 0, sizeof(samples_ref));
 182     memset(samples, 0, sizeof(samples));
 183
 184     for (i = 0; i < SAMPLES; i++) {
 185         floats[i] = (rand()/(RAND_MAX+2.2)) - 1.1;
 186     }
 187
 188     func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
 189     func(SAMPLES, floats, samples_ref);
 190     pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
 191
 192     for (i = 0; i < SAMPLES; i++) {
 193         if (samples[i] != samples_ref[i]) {
 194             printf ("%d: %04x != %04x (%f)\n", i, samples[i], samples_ref[i],
 195                       floats[i]);
 196         }
 197     }
 198
 199     start = pa_rtclock_now();
 200     for (i = 0; i < TIMES; i++) {
 201         pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
 202     }
 203     stop = pa_rtclock_now();
 204     pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
 205
 206     start = pa_rtclock_now();
 207     for (i = 0; i < TIMES; i++) {
 208         func(SAMPLES, floats, samples_ref);
 209     }
 210     stop = pa_rtclock_now();
 211     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
 212 }
 213 #endif
 214 #endif /* defined (__i386__) || defined (__amd64__) */
 215
 216
 217 void pa_convert_func_init_sse(pa_cpu_x86_flag_t flags) {
 218 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
 219
 220 #ifdef RUN_TEST
 221     run_test();
 222 #endif
 223
 224     if (flags & PA_CPU_X86_SSE2) {
 225         pa_log_info("Initialising SSE2 optimized conversions.");
 226         pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
 227     } else {
 228         pa_log_info("Initialising SSE optimized conversions.");
 229         pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
 230     }
 231
 232 #endif /* defined (__i386__) || defined (__amd64__) */
 233 }