src/pulsecore/svolume_mmx.c

   1 /***
   2   This file is part of PulseAudio.
   3
   4   Copyright 2004-2006 Lennart Poettering
   5   Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
   6
   7   PulseAudio is free software; you can redistribute it and/or modify
   8   it under the terms of the GNU Lesser General Public License as published
   9   by the Free Software Foundation; either version 2.1 of the License,
  10   or (at your option) any later version.
  11
  12   PulseAudio is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with PulseAudio; if not, write to the Free Software
  19   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  20   USA.
  21 ***/
  22
  23 #ifdef HAVE_CONFIG_H
  24 #include <config.h>
  25 #endif
  26
  27 #include <pulse/timeval.h>
  28 #include <pulsecore/random.h>
  29 #include <pulsecore/macro.h>
  30 #include <pulsecore/g711.h>
  31 #include <pulsecore/core-util.h>
  32
  33 #include "cpu-x86.h"
  34
  35 #include "sample-util.h"
  36 #include "endianmacros.h"
  37
  38 #if 0
  39 static void
  40 pa_volume_u8_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
  41 {
  42   /* C loop (compiled out by the surrounding #if 0): scales `length` unsigned 8-bit samples in place. */
  43   uint8_t *const end = samples + length;
  44   unsigned ch = 0;
  45   for (; samples != end; samples++) {
  46     const int32_t vol = volumes[ch];              /* volume of the current channel */
  47     int32_t scaled = (int32_t) *samples - 0x80;   /* remove the 0x80 bias */
  48
  49     /* multiply by the low and by the high 16 bits of the volume separately */
  50     scaled = ((scaled * (vol & 0xFFFF)) >> 16) + (scaled * (vol >> 16));
  51     scaled = PA_CLAMP_UNLIKELY(scaled, -0x80, 0x7F);
  52     *samples = (uint8_t) (scaled + 0x80);
  53
  54     /* step to the next channel, wrapping after the last one */
  55     if (PA_UNLIKELY(++ch >= channels))
  56       ch = 0;
  57   }
  58 }
  59
  60 static void
  61 pa_volume_alaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
  62 {
  63   /* C loop (compiled out by the surrounding #if 0): scales `length` A-law coded bytes in place. */
  64   uint8_t *const end = samples + length;
  65   unsigned ch = 0;
  66   for (; samples != end; samples++) {
  67     const int32_t vol = volumes[ch];                         /* volume of the current channel */
  68     int32_t linear = (int32_t) st_alaw2linear16(*samples);   /* decode with st_alaw2linear16 */
  69
  70     /* multiply by the low and by the high 16 bits of the volume separately */
  71     linear = ((linear * (vol & 0xFFFF)) >> 16) + (linear * (vol >> 16));
  72     linear = PA_CLAMP_UNLIKELY(linear, -0x8000, 0x7FFF);
  73     *samples = (uint8_t) st_13linear2alaw((int16_t) linear >> 3);
  74
  75     /* step to the next channel, wrapping after the last one */
  76     if (PA_UNLIKELY(++ch >= channels))
  77       ch = 0;
  78   }
  79 }
  80
  81 static void
  82 pa_volume_ulaw_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
  83 {
  84   /* C loop (compiled out by the surrounding #if 0): scales `length` u-law coded bytes in place. */
  85   uint8_t *const end = samples + length;
  86   unsigned ch = 0;
  87   for (; samples != end; samples++) {
  88     const int32_t vol = volumes[ch];                         /* volume of the current channel */
  89     int32_t linear = (int32_t) st_ulaw2linear16(*samples);   /* decode with st_ulaw2linear16 */
  90
  91     /* multiply by the low and by the high 16 bits of the volume separately */
  92     linear = ((linear * (vol & 0xFFFF)) >> 16) + (linear * (vol >> 16));
  93     linear = PA_CLAMP_UNLIKELY(linear, -0x8000, 0x7FFF);
  94     *samples = (uint8_t) st_14linear2ulaw((int16_t) linear >> 2);
  95
  96     /* step to the next channel, wrapping after the last one */
  97     if (PA_UNLIKELY(++ch >= channels))
  98       ch = 0;
  99   }
 100 }
 101 #endif
 102
 103 /* in s: 2 int16_t samples (the register is clobbered)
 104  * in v: 2 int32_t volumes, fixed point 16:16
 105  * out v: contains the scaled and clamped int16_t samples.
 106  *
 107  * Expects %mm6 = | ffff | 0 | and %mm7 = | 0 | 1 | in every 32-bit lane (the
 108  * functions below set them up); %mm4 and %mm5 are used as scratch registers.
 109  * We calculate the high 32 bits of a 32x16 multiply which we then
 110  * clamp to 16 bits. The calculation is:
 111  *  vl = (v & 0xffff)
 112  *  vh = (v >> 16)
 113  *  s = ((s * vl) >> 16) + (s * vh);
 114  *
 115  * For the first multiply we have to do a sign correction as we need to
 116  * multiply a signed int with an unsigned int. Hacker's delight 8-3 gives a
 117  * simple formula to correct the sign of the high word after the signed multiply.
 118  */
 119 #define VOLUME_32x16(s,v)                  /* .. |   vh  |   vl  | */                   \
 120       " pxor  %%mm4, %%mm4           \n\t" /* .. |    0  |    0  | */                   \
 121       " punpcklwd %%mm4, "#s"        \n\t" /* .. |    0  |   p0  | */                   \
 122       " pcmpgtw "#v", %%mm4          \n\t" /* .. |    0  | s(vl) | */                   \
 123       " pand "#s", %%mm4             \n\t" /* .. |    0  |  (p0) |  (vl >> 15) & p */   \
 124       " movq %%mm6, %%mm5            \n\t" /* .. |  ffff |   0   | */                   \
 125       " pand "#v", %%mm5             \n\t" /* .. |   vh  |   0   | */                   \
 126       " por %%mm5, %%mm4             \n\t" /* .. |   vh  |  (p0) | */                   \
 127       " pmulhw "#s", "#v"            \n\t" /* .. |    0  | vl*p0 | */                   \
 128       " paddw %%mm4, "#v"            \n\t" /* .. |   vh  | vl*p0 | vh + sign correct */ \
 129       " pslld $16, "#s"              \n\t" /* .. |   p0  |    0  | */                   \
 130       " por %%mm7, "#s"              \n\t" /* .. |   p0  |    1  | */                   \
 131       " pmaddwd "#s", "#v"           \n\t" /* .. |    p0 * v0    | */                   \
 132       " packssdw "#v", "#v"          \n\t" /* .. | p1*v1 | p0*v0 | */
 133
 134 /* advances %3 = (%3 + a) % b, using %4 as scratch. Only a single subtraction of b
 135  * is performed, so the result is exact only if %3 < b on entry and a <= b. */
 136 #define MOD_ADD(a,b) \
 137       " add "#a", %3                 \n\t" \
 138       " mov %3, %4                   \n\t" \
 139       " sub "#b", %4                 \n\t" \
 140       " cmp "#b", %3                 \n\t" \
 141       " cmovae %4, %3                \n\t"
 142
 143 /* swap the two bytes of every 16-bit word in s; %mm4 is used as scratch */
 144 #define SWAP_16(s) \
 145       " movq "#s", %%mm4             \n\t" /* .. |  h  l |  */ \
 146       " psrlw $8, %%mm4              \n\t" /* .. |  0  h |  */ \
 147       " psllw $8, "#s"               \n\t" /* .. |  l  0 |  */ \
 148       " por %%mm4, "#s"              \n\t" /* .. |  l  h |  */
 149
 150 /* byte-swap every 16-bit word of s1 and s2, interleaved for better pairing; scratch: %mm4, %mm5 */
 151 #define SWAP_16_2(s1,s2) \
 152       " movq "#s1", %%mm4            \n\t" /* .. |  h  l |  */ \
 153       " movq "#s2", %%mm5            \n\t"                     \
 154       " psrlw $8, %%mm4              \n\t" /* .. |  0  h |  */ \
 155       " psrlw $8, %%mm5              \n\t"                     \
 156       " psllw $8, "#s1"              \n\t" /* .. |  l  0 |  */ \
 157       " psllw $8, "#s2"              \n\t"                     \
 158       " por %%mm4, "#s1"             \n\t" /* .. |  l  h |  */ \
 159       " por %%mm5, "#s2"             \n\t"
 160
 161 static void
 162 pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 163 {
 164   pa_reg_x86 channel, temp;
 165   /* Scales `length` bytes of native-endian int16_t samples in place. The loops below step the channel
 166    * index by up to 4 and wrap it at `channels`, so use a multiple of the real channel count that is >= 4
 167    * (3 becomes 6). This is also the max amount we overread the volume array, which should have enough padding. */
 168   channels = channels == 3 ? 6 : MAX (4, channels);
 169
 170   __asm__ __volatile__ (
 171     " xor %3, %3                    \n\t"
 172     " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */
 173     " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */
 174     " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */
 175     " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */
 176     " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */
 177
 178     " test $1, %2                   \n\t" /* check for odd samples */
 179     " je 2f                         \n\t"
 180
 181     " movd (%1, %3, 4), %%mm0       \n\t" /* |  v0h  |  v0l  | */
 182     " movw (%0), %w4                \n\t" /*     ..  |  p0   | */
 183     " movd %4, %%mm1                \n\t"
 184     VOLUME_32x16 (%%mm1, %%mm0)
 185     " movd %%mm0, %4                \n\t" /*     ..  | p0*v0 | */
 186     " movw %w4, (%0)                \n\t"
 187     " add $2, %0                    \n\t"
 188     MOD_ADD ($1, %5)
 189
 190     "2:                             \n\t"
 191     " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */
 192     " test $1, %2                   \n\t" /* check for odd samples */
 193     " je 4f                         \n\t"
 194
 195     "3:                             \n\t" /* do samples in groups of 2 */
 196     " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */
 197     " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */
 198     VOLUME_32x16 (%%mm1, %%mm0)
 199     " movd %%mm0, (%0)              \n\t" /* | p1*v1 | p0*v0 | */
 200     " add $4, %0                    \n\t"
 201     MOD_ADD ($2, %5)
 202
 203     "4:                             \n\t"
 204     " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */
 205     " cmp $0, %2                    \n\t"
 206     " je 6f                         \n\t"
 207
 208     "5:                             \n\t" /* do samples in groups of 4 */
 209     " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */
 210     " movq 8(%1, %3, 4), %%mm2      \n\t" /* |  v3h  |  v3l  |  v2h  |  v2l  | */
 211     " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */
 212     " movd 4(%0), %%mm3             \n\t" /*              .. |   p3  |  p2   | */
 213     VOLUME_32x16 (%%mm1, %%mm0)
 214     VOLUME_32x16 (%%mm3, %%mm2)
 215     " movd %%mm0, (%0)              \n\t" /* | p1*v1 | p0*v0 | */
 216     " movd %%mm2, 4(%0)             \n\t" /* | p3*v3 | p2*v2 | */
 217     " add $8, %0                    \n\t"
 218     MOD_ADD ($4, %5)
 219     " dec %2                        \n\t"
 220     " jne 5b                        \n\t"
 221
 222     "6:                             \n\t"
 223     " emms                          \n\t"
 224     /* %3 is written by the very first instruction while %5 is still needed, hence the earlyclobber */
 225     : "+r" (samples), "+r" (volumes), "+r" (length), "=&D" (channel), "=&r" (temp)
 226     : "r" ((pa_reg_x86)channels)
 227     : "cc", "memory" /* the samples are read and written through %0 */
 228   );
 229 }
 230
 231 static void
 232 pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 233 {
 234   pa_reg_x86 channel, temp;
 235
 236   /* the max number of samples we process at a time, this is also the max amount
 237    * we overread the volume array, which should have enough padding. */
 238   channels = MAX (4, channels);
 239
 240   __asm__ __volatile__ (
 241     " xor %3, %3                    \n\t"
 242     " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */
 243     " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */
 244     " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */
 245     " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */
 246     " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */
 247
 248     " test $1, %2                   \n\t" /* check for odd samples */
 249     " je 2f                         \n\t"
 250
 251     " movd (%1, %3, 4), %%mm0       \n\t" /* |  v0h  |  v0l  | */
 252     " movw (%0), %4                 \n\t" /*     ..  |  p0   | */
 253     " rorw $8, %4                   \n\t"
 254     " movd %4, %%mm1                \n\t"
 255     VOLUME_32x16 (%%mm1, %%mm0)
 256     " movd %%mm0, %4                \n\t" /*     ..  | p0*v0 | */
 257     " rorw $8, %4                   \n\t"
 258     " movw %4, (%0)                 \n\t"
 259     " add $2, %0                    \n\t"
 260     MOD_ADD ($1, %5)
 261
 262     "2:                             \n\t"
 263     " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */
 264     " test $1, %2                   \n\t" /* check for odd samples */
 265     " je 4f                         \n\t"
 266
 267     "3:                             \n\t" /* do samples in groups of 2 */
 268     " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */
 269     " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */
 270     SWAP_16 (%%mm1)
 271     VOLUME_32x16 (%%mm1, %%mm0)
 272     SWAP_16 (%%mm0)
 273     " movd %%mm0, (%0)              \n\t" /* | p1*v1 | p0*v0 | */
 274     " add $4, %0                    \n\t"
 275     MOD_ADD ($2, %5)
 276
 277     "4:                             \n\t"
 278     " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */
 279     " cmp $0, %2                    \n\t"
 280     " je 6f                         \n\t"
 281
 282     "5:                             \n\t" /* do samples in groups of 4 */
 283     " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */
 284     " movq 8(%1, %3, 4), %%mm2      \n\t" /* |  v3h  |  v3l  |  v2h  |  v2l  | */
 285     " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */
 286     " movd 4(%0), %%mm3             \n\t" /*              .. |   p3  |  p2   | */
 287     SWAP_16_2 (%%mm1, %%mm3)
 288     VOLUME_32x16 (%%mm1, %%mm0)
 289     VOLUME_32x16 (%%mm3, %%mm2)
 290     SWAP_16_2 (%%mm0, %%mm2)
 291     " movd %%mm0, (%0)              \n\t" /* | p1*v1 | p0*v0 | */
 292     " movd %%mm2, 4(%0)             \n\t" /* | p3*v3 | p2*v2 | */
 293     " add $8, %0                    \n\t"
 294     MOD_ADD ($4, %5)
 295     " dec %2                        \n\t"
 296     " jne 5b                        \n\t"
 297
 298     "6:                             \n\t"
 299     " emms                          \n\t"
 300
 301     : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
 302     : "r" ((pa_reg_x86)channels)
 303     : "cc"
 304   );
 305 }
 306
 307 #if 0
 308 static void
 309 pa_volume_float32ne_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
 310 {
 311   /* C loop (compiled out by the surrounding #if 0): multiplies floats in place; `length` is in bytes. */
 312   float *const end = samples + length / sizeof (float);
 313   unsigned ch = 0;
 314   for (; samples != end; samples++) {
 315     *samples = *samples * volumes[ch];
 316
 317     /* step to the next channel, wrapping after the last one */
 318     if (PA_UNLIKELY(++ch >= channels))
 319       ch = 0;
 320   }
 321 }
 322
 323 static void
 324 pa_volume_float32re_mmx (float *samples, float *volumes, unsigned channels, unsigned length)
 325 {
 326   /* C loop (compiled out by the surrounding #if 0): multiplies byte-swapped floats in place; `length` is in bytes. */
 327   float *const end = samples + length / sizeof (float);
 328   unsigned ch = 0;
 329
 330   for (; samples != end; samples++) {
 331     /* swap the bytes so the value can be multiplied, swap back when storing */
 332     float swapped = PA_FLOAT32_SWAP(*samples);
 333
 334     swapped = swapped * volumes[ch];
 335     *samples = PA_FLOAT32_SWAP(swapped);
 336
 337     if (PA_UNLIKELY(++ch >= channels))
 338       ch = 0;
 339   }
 340 }
 341
 342 static void
 343 pa_volume_s32ne_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 344 {
 345   /* C loop (compiled out by the surrounding #if 0): scales int32_t samples in place; `length` is in bytes. */
 346   int32_t *const end = samples + length / sizeof (int32_t);
 347   unsigned ch = 0;
 348
 349   for (; samples != end; samples++) {
 350     /* widen to 64 bits so the product with the 32-bit volume fits */
 351     int64_t wide = (int64_t) *samples;
 352
 353     wide = (wide * volumes[ch]) >> 16;
 354     wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
 355     *samples = (int32_t) wide;
 356
 357     if (PA_UNLIKELY(++ch >= channels))
 358       ch = 0;
 359   }
 360 }
 361
 362 static void
 363 pa_volume_s32re_mmx (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 364 {
 365   /* C loop (compiled out by the surrounding #if 0): scales byte-swapped int32_t samples in place; `length` is in bytes. */
 366   int32_t *const end = samples + length / sizeof (int32_t);
 367   unsigned ch = 0;
 368
 369   for (; samples != end; samples++) {
 370     /* swap the bytes first, then widen so the product with the volume fits */
 371     int64_t wide = (int64_t) PA_INT32_SWAP(*samples);
 372
 373     wide = (wide * volumes[ch]) >> 16;
 374     wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
 375     *samples = PA_INT32_SWAP((int32_t) wide);
 376
 377     if (PA_UNLIKELY(++ch >= channels))
 378       ch = 0;
 379   }
 380 }
 381
 382 static void
 383 pa_volume_s24ne_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 384 {
 385   /* C loop (compiled out by the surrounding #if 0): scales 3-byte samples in place over `length` bytes. */
 386   uint8_t *const end = samples + length;
 387   unsigned ch = 0;
 388
 389   while (samples < end) {
 390     /* place the 24-bit value in the top of an int32_t, then widen for the multiply */
 391     int64_t wide = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
 392
 393     wide = (wide * volumes[ch]) >> 16;
 394     wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
 395     PA_WRITE24NE(samples, ((uint32_t) (int32_t) wide) >> 8);
 396     samples += 3;
 397
 398     if (PA_UNLIKELY(++ch >= channels))
 399       ch = 0;
 400   }
 401 }
 402
 403 static void
 404 pa_volume_s24re_mmx (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 405 {
 406   /* C loop (compiled out by the surrounding #if 0): scales 3-byte samples, accessed with the RE helpers, in place. */
 407   uint8_t *const end = samples + length;
 408   unsigned ch = 0;
 409
 410   while (samples < end) {
 411     /* place the 24-bit value in the top of an int32_t, then widen for the multiply */
 412     int64_t wide = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
 413
 414     wide = (wide * volumes[ch]) >> 16;
 415     wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
 416     PA_WRITE24RE(samples, ((uint32_t) (int32_t) wide) >> 8);
 417     samples += 3;
 418
 419     if (PA_UNLIKELY(++ch >= channels))
 420       ch = 0;
 421   }
 422 }
 423
 424 static void
 425 pa_volume_s24_32ne_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 426 {
 427   /* C loop (compiled out by the surrounding #if 0): scales 24-bit samples held in uint32_t words; `length` is in bytes. */
 428   uint32_t *const end = samples + length / sizeof (uint32_t);
 429   unsigned ch = 0;
 430
 431   for (; samples != end; samples++) {
 432     /* move the low 24 bits to the top of an int32_t, then widen for the multiply */
 433     int64_t wide = (int64_t) ((int32_t) (*samples << 8));
 434
 435     wide = (wide * volumes[ch]) >> 16;
 436     wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
 437     *samples = ((uint32_t) ((int32_t) wide)) >> 8;
 438
 439     if (PA_UNLIKELY(++ch >= channels))
 440       ch = 0;
 441   }
 442 }
 443
 444 static void
 445 pa_volume_s24_32re_mmx (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 446 {
 447   /* C loop (compiled out by the surrounding #if 0): as the NE variant, but every word is byte-swapped on load and store. */
 448   uint32_t *const end = samples + length / sizeof (uint32_t);
 449   unsigned ch = 0;
 450
 451   for (; samples != end; samples++) {
 452     /* swap, move the low 24 bits to the top of an int32_t, then widen for the multiply */
 453     int64_t wide = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
 454
 455     wide = (wide * volumes[ch]) >> 16;
 456     wide = PA_CLAMP_UNLIKELY(wide, -0x80000000LL, 0x7FFFFFFFLL);
 457     *samples = PA_UINT32_SWAP(((uint32_t) ((int32_t) wide)) >> 8);
 458
 459     if (PA_UNLIKELY(++ch >= channels))
 460       ch = 0;
 461   }
 462 }
 463 #endif
 464
 465 #undef RUN_TEST
 466 /* define RUN_TEST above instead to self-check and benchmark the MMX code from pa_volume_func_init_mmx() */
 467 #ifdef RUN_TEST
 468 #define CHANNELS 2     /* channels used by the self-test */
 469 #define SAMPLES 1021   /* int16_t samples per run */
 470 #define TIMES 1000     /* iterations of each timing loop */
 471 #define PADDING 16     /* extra volume entries appended after the real ones */
 472 /* Compares pa_volume_s16ne_mmx() with the function returned by pa_get_volume_func(), then times both. */
 473 static void run_test (void) {
 474   int16_t samples[SAMPLES];
 475   int16_t samples_ref[SAMPLES];
 476   int16_t samples_orig[SAMPLES];
 477   int32_t volumes[CHANNELS + PADDING];
 478   int i, j, padding;
 479   pa_do_volume_func_t func;
 480   struct timeval start, stop;
 481   /* reference implementation: whatever is currently registered for S16NE */
 482   func = pa_get_volume_func (PA_SAMPLE_S16NE);
 483
 484   printf ("checking MMX %zu\n", sizeof (samples)); /* sizeof yields size_t, which needs %zu */
 485
 486   pa_random (samples, sizeof (samples));
 487   memcpy (samples_ref, samples, sizeof (samples));
 488   memcpy (samples_orig, samples, sizeof (samples));
 489   /* random volumes, repeated into the padding that the MMX code may overread */
 490   for (i = 0; i < CHANNELS; i++)
 491     volumes[i] = rand() >> 1;
 492   for (padding = 0; padding < PADDING; padding++, i++)
 493     volumes[i] = volumes[padding];
 494   /* run both on the same input and report every sample that differs */
 495   func (samples_ref, volumes, CHANNELS, sizeof (samples));
 496   pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
 497   for (i = 0; i < SAMPLES; i++) {
 498     if (samples[i] != samples_ref[i]) {
 499       printf ("%d: %04x != %04x (%04x * %04x)\n", i, (unsigned) (uint16_t) samples[i], (unsigned) (uint16_t) samples_ref[i],
 500           (unsigned) (uint16_t) samples_orig[i], (unsigned) volumes[i % CHANNELS]); /* casts avoid sign-extended hex output */
 501     }
 502   }
 503   /* time TIMES runs of the MMX routine on a fresh copy of the input */
 504   pa_gettimeofday(&start);
 505   for (j = 0; j < TIMES; j++) {
 506     memcpy (samples, samples_orig, sizeof (samples));
 507     pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
 508   }
 509   pa_gettimeofday(&stop);
 510   pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
 511   /* same measurement for the reference implementation */
 512   pa_gettimeofday(&start);
 513   for (j = 0; j < TIMES; j++) {
 514     memcpy (samples_ref, samples_orig, sizeof (samples));
 515     func (samples_ref, volumes, CHANNELS, sizeof (samples));
 516   }
 517   pa_gettimeofday(&stop);
 518   pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
 519 }
 520 #endif
 521
 522 void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) { /* installs the MMX s16 scalers; `flags` is not examined here */
 523   const pa_do_volume_func_t s16ne = (pa_do_volume_func_t) pa_volume_s16ne_mmx; /* native-endian routine */
 524   const pa_do_volume_func_t s16re = (pa_do_volume_func_t) pa_volume_s16re_mmx; /* byte-swapped routine */
 525   pa_log_info("Initialising MMX optimized functions.");
 526 #ifdef RUN_TEST
 527   run_test (); /* runs before the overrides below, as it fetches the registered S16NE function as reference */
 528 #endif
 529   pa_set_volume_func (PA_SAMPLE_S16NE,     s16ne);
 530   pa_set_volume_func (PA_SAMPLE_S16RE,     s16re);
 531 }