src/pulsecore/sconv_sse.c

   1 /***
   2   This file is part of PulseAudio.
   3
   4   Copyright 2004-2006 Lennart Poettering
   5   Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB
   6
   7   PulseAudio is free software; you can redistribute it and/or modify
   8   it under the terms of the GNU Lesser General Public License as published
   9   by the Free Software Foundation; either version 2.1 of the License,
  10   or (at your option) any later version.
  11
  12   PulseAudio is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with PulseAudio; if not, write to the Free Software
  19   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  20   USA.
  21 ***/
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29
  30 #include <pulsecore/g711.h>
  31 #include <pulsecore/macro.h>
  32
  33 #include "endianmacros.h"
  34
  35 #include "cpu-x86.h"
  36 #include "sconv.h"
  37
  38 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
  39
  40 static const PA_DECLARE_ALIGNED (16, float, one[4]) = { 1.0, 1.0, 1.0, 1.0 };
  41 static const PA_DECLARE_ALIGNED (16, float, mone[4]) = { -1.0, -1.0, -1.0, -1.0 };
  42 static const PA_DECLARE_ALIGNED (16, float, scale[4]) = { 0x7fff, 0x7fff, 0x7fff, 0x7fff };
  43
  44 static void pa_sconv_s16le_from_f32ne_sse(unsigned n, const float *a, int16_t *b) {
  45     pa_reg_x86 temp, i;
  46
  47     __asm__ __volatile__ (
  48         " movaps %5, %%xmm5             \n\t"
  49         " movaps %6, %%xmm6             \n\t"
  50         " movaps %7, %%xmm7             \n\t"
  51         " xor %0, %0                    \n\t"
  52
  53         " mov %4, %1                    \n\t"
  54         " sar $3, %1                    \n\t" /* 8 floats at a time */
  55         " cmp $0, %1                    \n\t"
  56         " je 2f                         \n\t"
  57
  58         "1:                             \n\t"
  59         " movups (%2, %0, 2), %%xmm0    \n\t" /* read 8 floats */
  60         " movups 16(%2, %0, 2), %%xmm2  \n\t"
  61         " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
  62         " minps  %%xmm5, %%xmm2         \n\t"
  63         " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
  64         " maxps  %%xmm6, %%xmm2         \n\t"
  65         " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
  66         " mulps  %%xmm7, %%xmm2         \n\t"
  67
  68         " cvtps2pi %%xmm0, %%mm0        \n\t" /* low part to int */
  69         " cvtps2pi %%xmm2, %%mm2        \n\t"
  70         " movhlps  %%xmm0, %%xmm0       \n\t" /* bring high part in position */
  71         " movhlps  %%xmm2, %%xmm2       \n\t"
  72         " cvtps2pi %%xmm0, %%mm1        \n\t" /* high part to int */
  73         " cvtps2pi %%xmm2, %%mm3        \n\t"
  74
  75         " packssdw %%mm1, %%mm0         \n\t" /* pack parts */
  76         " packssdw %%mm3, %%mm2         \n\t"
  77         " movq     %%mm0, (%3, %0)      \n\t"
  78         " movq    %%mm2, 8(%3, %0)     \n\t"
  79
  80         " add $16, %0                   \n\t"
  81         " dec %1                        \n\t"
  82         " jne 1b                        \n\t"
  83
  84         "2:                             \n\t"
  85         " mov %4, %1                    \n\t" /* prepare for leftovers */
  86         " and $7, %1                    \n\t"
  87         " je 4f                         \n\t"
  88
  89         "3:                             \n\t"
  90         " movss (%2, %0, 2), %%xmm0     \n\t"
  91         " minss  %%xmm5, %%xmm0         \n\t"
  92         " maxss  %%xmm6, %%xmm0         \n\t"
  93         " mulss  %%xmm7, %%xmm0         \n\t"
  94         " cvtss2si %%xmm0, %4           \n\t"
  95         " movw  %w4, (%3, %0)           \n\t"
  96         " add $2, %0                    \n\t"
  97         " dec %1                        \n\t"
  98         " jne 3b                        \n\t"
  99
 100         "4:                             \n\t"
 101         " emms                          \n\t"
 102
 103         : "=&r" (i), "=&r" (temp)
 104         : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
 105         : "cc", "memory"
 106     );
 107 }
 108
 109 static void pa_sconv_s16le_from_f32ne_sse2(unsigned n, const float *a, int16_t *b) {
 110     pa_reg_x86 temp, i;
 111
 112     __asm__ __volatile__ (
 113         " movaps %5, %%xmm5             \n\t"
 114         " movaps %6, %%xmm6             \n\t"
 115         " movaps %7, %%xmm7             \n\t"
 116         " xor %0, %0                    \n\t"
 117
 118         " mov %4, %1                    \n\t"
 119         " sar $3, %1                    \n\t" /* 8 floats at a time */
 120         " cmp $0, %1                    \n\t"
 121         " je 2f                         \n\t"
 122
 123         "1:                             \n\t"
 124         " movups (%2, %0, 2), %%xmm0    \n\t" /* read 8 floats */
 125         " movups 16(%2, %0, 2), %%xmm2  \n\t"
 126         " minps  %%xmm5, %%xmm0         \n\t" /* clamp to 1.0 */
 127         " minps  %%xmm5, %%xmm2         \n\t"
 128         " maxps  %%xmm6, %%xmm0         \n\t" /* clamp to -1.0 */
 129         " maxps  %%xmm6, %%xmm2         \n\t"
 130         " mulps  %%xmm7, %%xmm0         \n\t" /* *= 0x7fff */
 131         " mulps  %%xmm7, %%xmm2         \n\t"
 132
 133         " cvtps2dq %%xmm0, %%xmm0       \n\t"
 134         " cvtps2dq %%xmm2, %%xmm2       \n\t"
 135
 136         " packssdw %%xmm2, %%xmm0       \n\t"
 137         " movdqu   %%xmm0, (%3, %0)     \n\t"
 138
 139         " add $16, %0                   \n\t"
 140         " dec %1                        \n\t"
 141         " jne 1b                        \n\t"
 142
 143         "2:                             \n\t"
 144         " mov %4, %1                    \n\t" /* prepare for leftovers */
 145         " and $7, %1                    \n\t"
 146         " je 4f                         \n\t"
 147
 148         "3:                             \n\t"
 149         " movss (%2, %0, 2), %%xmm0     \n\t"
 150         " minss  %%xmm5, %%xmm0         \n\t"
 151         " maxss  %%xmm6, %%xmm0         \n\t"
 152         " mulss  %%xmm7, %%xmm0         \n\t"
 153         " cvtss2si %%xmm0, %4           \n\t"
 154         " movw  %w4, (%3, %0)           \n\t"
 155         " add $2, %0                    \n\t"
 156         " dec %1                        \n\t"
 157         " jne 3b                        \n\t"
 158
 159         "4:                             \n\t"
 160
 161         : "=&r" (i), "=&r" (temp)
 162         : "r" (a), "r" (b), "r" ((pa_reg_x86)n), "m" (*one), "m" (*mone), "m" (*scale)
 163         : "cc", "memory"
 164     );
 165 }
 166
 167 #undef RUN_TEST
 168
 169 #ifdef RUN_TEST
 170 #define SAMPLES 1019
 171 #define TIMES 1000
 172
 173 static void run_test(void) {
 174     int16_t samples[SAMPLES];
 175     int16_t samples_ref[SAMPLES];
 176     float floats[SAMPLES];
 177     int i;
 178     pa_usec_t start, stop;
 179     pa_convert_func_t func;
 180
 181     printf("checking SSE %zd\n", sizeof(samples));
 182
 183     memset(samples_ref, 0, sizeof(samples_ref));
 184     memset(samples, 0, sizeof(samples));
 185
 186     for (i = 0; i < SAMPLES; i++) {
 187         floats[i] = (rand()/(RAND_MAX+2.2)) - 1.1;
 188     }
 189
 190     func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
 191     func(SAMPLES, floats, samples_ref);
 192     pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
 193
 194     for (i = 0; i < SAMPLES; i++) {
 195         if (samples[i] != samples_ref[i]) {
 196             printf ("%d: %04x != %04x (%f)\n", i, samples[i], samples_ref[i],
 197                       floats[i]);
 198         }
 199     }
 200
 201     start = pa_rtclock_now();
 202     for (i = 0; i < TIMES; i++) {
 203         pa_sconv_s16le_from_f32ne_sse2(SAMPLES, floats, samples);
 204     }
 205     stop = pa_rtclock_now();
 206     pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
 207
 208     start = pa_rtclock_now();
 209     for (i = 0; i < TIMES; i++) {
 210         func(SAMPLES, floats, samples_ref);
 211     }
 212     stop = pa_rtclock_now();
 213     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
 214 }
 215 #endif
 216 #endif /* defined (__i386__) || defined (__amd64__) */
 217
 218
 219 void pa_convert_func_init_sse(pa_cpu_x86_flag_t flags) {
 220 #if !defined(__APPLE__) && defined (__i386__) || defined (__amd64__)
 221
 222 #ifdef RUN_TEST
 223     run_test();
 224 #endif
 225
 226     if (flags & PA_CPU_X86_SSE2) {
 227         pa_log_info("Initialising SSE2 optimized conversions.");
 228         pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse2);
 229     } else {
 230         pa_log_info("Initialising SSE optimized conversions.");
 231         pa_set_convert_from_float32ne_function(PA_SAMPLE_S16LE, (pa_convert_func_t) pa_sconv_s16le_from_f32ne_sse);
 232     }
 233
 234 #endif /* defined (__i386__) || defined (__amd64__) */
 235 }