apps/codecs/libtremor/asm_mcf5249.h

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  *
   9  * Copyright (C) 2005 by Pedro Vasconcelos
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License
  13  * as published by the Free Software Foundation; either version 2
  14  * of the License, or (at your option) any later version.
  15  *
  16  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  17  * KIND, either express or implied.
  18  *
  19  ****************************************************************************/
  20 /* asm routines for wide math on the MCF5249 */
  21
  22 #include "os_types.h"
  23
  24 #if defined(CPU_COLDFIRE)
  25
  26 /* attribute for 16-byte alignment */
  27 #define LINE_ATTR   __attribute__ ((aligned (16)))
  28
  29 #ifndef _V_WIDE_MATH
  30 #define _V_WIDE_MATH
  31
  32 #define MB()
  33
  34 static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
       /* Fixed-point multiply of x by y on the ColdFire MAC unit.
        *
        * mac.l adds the product into acc0, movclr.l copies the accumulator
        * back into x and zeroes it, and asr.l then shifts that value right
        * by one bit without any overflow check.  Because mac.l accumulates,
        * acc0 has to be clear on entry for the result to be just x*y.
        *
        * NOTE(review): presumably the MAC unit has been put in fractional
        * mode elsewhere, so that the value read back is the product >> 31
        * and the extra shift yields the canonical Tremor MULT32 result
        * (product >> 32) -- confirm against the MAC initialisation code.
        *
        * x is an early-clobber read/write operand held in a data register
        * (asr.l operates on one); y may sit in any general register.
        */
  35
  36   asm volatile ("mac.l %[x], %[y], %%acc0;"    /* multiply & shift  */
  37                 "movclr.l %%acc0, %[x];"       /* move & clear acc */
  38                 "asr.l #1, %[x];"              /* no overflow test */
  39                 : [x] "+&d" (x)
  40                 : [y] "r" (y)
  41                 : "cc");
  42   return x;
  43 }
  44
  45 static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
       /* Fixed-point multiply of x by y on the ColdFire MAC unit, returning
        * the accumulator value unshifted.
        *
        * mac.l adds the product into acc0 and movclr.l hands the accumulator
        * back in x while zeroing it, so acc0 must be clear on entry.
        *
        * NOTE(review): presumably the MAC unit runs in fractional mode, which
        * would make the returned value the product >> 31 -- confirm against
        * the MAC initialisation code.
        *
        * x is an early-clobber read/write operand; no shift follows, so any
        * general register ("r") is accepted for it.
        */
  46
  47   asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
  48                 "movclr.l %%acc0, %[x];"    /* move and clear */
  49                 : [x] "+&r" (x)
  50                 : [y] "r" (y)
  51                 : "cc");
  52   return x;
  53 }
  54
  55
  56 static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
       /* Returns (hi << 16) | ((unsigned)lo >> 15), where
        *   hi = the value read back from acc0 after mac.l of x and y, and
        *   lo = the low 32 bits of x*y, computed separately with mulu.l.
        *
        * The mulu.l is placed between the mac.l and the movclr.l so that the
        * integer unit does useful work instead of stalling on the MAC result
        * (per the original "avoid emac stall" remark).
        *
        * Each shift is issued as two instructions whose counts add up to the
        * wanted amount (8+8 = 16, 8+7 = 15).
        *
        * NOTE(review): "plus one free" presumably refers to the MAC unit
        * being in fractional mode, i.e. hi already being the product >> 31,
        * so that hi << 16 lines up with a 15-bit right shift of the full
        * product -- confirm against the MAC initialisation code.
        *
        * acc0 must be clear on entry.  x is overwritten with lo; r is
        * early-clobber so it never shares a register with x or y.
        */
  57   ogg_int32_t r;
  58
  59   asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
  60                 "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
  61                 "movclr.l %%acc0, %[r];"     /* get higher half */
  62                 "asl.l #8, %[r];"            /* hi<<16, plus one free */
  63                 "asl.l #8, %[r];"
  64                 "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
  65                 "lsr.l #7, %[x];"
  66                 "or.l %[x], %[r];"           /* logical-or results */
  67                 : [r] "=&d" (r), [x] "+d" (x)
  68                 : [y] "d" (y)
  69                 : "cc");
  70   return r;
  71 }
  72
  73
  74 static inline
  75 void XPROD31(ogg_int32_t  a, ogg_int32_t  b,
  76              ogg_int32_t  t, ogg_int32_t  v,
  77              ogg_int32_t *x, ogg_int32_t *y)
  78 {
       /* Cross product on the ColdFire MAC unit:
        *
        *   *x = a*t + b*v    (accumulated in acc0)
        *   *y = b*t - a*v    (accumulated in acc1)
        *
        * Both accumulators are read back with movclr.l, which also zeroes
        * them again; they must be clear on entry because mac.l/msac.l
        * accumulate.  Register a doubles as the scratch register for the
        * two results, hence its early-clobber read/write constraint.
        *
        * The stores go through x and y inside the asm, which is why
        * "memory" is listed as clobbered.
        *
        * NOTE(review): the scaling of each product depends on the MAC mode
        * configured elsewhere -- presumably the same as for MULT31.
        */
  79   asm volatile ("mac.l %[a], %[t], %%acc0;"     /* acc0  = a*t */
  80                 "mac.l %[b], %[v], %%acc0;"     /* acc0 += b*v */
  81                 "mac.l %[b], %[t], %%acc1;"     /* acc1  = b*t */
  82                 "msac.l %[a], %[v], %%acc1;"    /* acc1 -= a*v */
  83                 "movclr.l %%acc0, %[a];"        /* fetch & clear acc0 */
  84                 "move.l %[a], (%[x]);"          /* *x = first result */
  85                 "movclr.l %%acc1, %[a];"        /* fetch & clear acc1 */
  86                 "move.l %[a], (%[y]);"          /* *y = second result */
  87                 : [a] "+&r" (a)
  88                 : [x] "a" (x), [y] "a" (y),
  89                   [b] "r" (b), [t] "r" (t), [v] "r" (v)
  90                 : "cc", "memory");
  91 }
  92
  93
  94 static inline
  95 void XNPROD31(ogg_int32_t  a, ogg_int32_t  b,
  96               ogg_int32_t  t, ogg_int32_t  v,
  97               ogg_int32_t *x, ogg_int32_t *y)
  98 {
       /* Cross product on the ColdFire MAC unit with the signs of the
        * second terms swapped relative to XPROD31:
        *
        *   *x = a*t - b*v    (accumulated in acc0)
        *   *y = b*t + a*v    (accumulated in acc1)
        *
        * Both accumulators are read back with movclr.l, which also zeroes
        * them again; they must be clear on entry because mac.l/msac.l
        * accumulate.  Register a doubles as the scratch register for the
        * two results, hence its early-clobber read/write constraint.
        *
        * The stores go through x and y inside the asm, which is why
        * "memory" is listed as clobbered.
        *
        * NOTE(review): the scaling of each product depends on the MAC mode
        * configured elsewhere -- presumably the same as for MULT31.
        */
  99   asm volatile ("mac.l %[a], %[t], %%acc0;"     /* acc0  = a*t */
 100                 "msac.l %[b], %[v], %%acc0;"    /* acc0 -= b*v */
 101                 "mac.l %[b], %[t], %%acc1;"     /* acc1  = b*t */
 102                 "mac.l %[a], %[v], %%acc1;"     /* acc1 += a*v */
 103                 "movclr.l %%acc0, %[a];"        /* fetch & clear acc0 */
 104                 "move.l %[a], (%[x]);"          /* *x = first result */
 105                 "movclr.l %%acc1, %[a];"        /* fetch & clear acc1 */
 106                 "move.l %[a], (%[y]);"          /* *y = second result */
 107                 : [a] "+&r" (a)
 108                 : [x] "a" (x), [y] "a" (y),
 109                   [b] "r" (b), [t] "r" (t), [v] "r" (v)
 110                 : "cc", "memory");
 111 }
 112
 113
 114 #if 0    /* canonical Tremor definition */
 115 #define XPROD32(_a, _b, _t, _v, _x, _y)         \
 116   { (_x)=MULT32(_a,_t)+MULT32(_b,_v);           \
 117     (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
 118 #endif
 119
 120 /* This could lose the LSB by overflow, but I don't think it'll ever happen.
 121    If anyone thinks they can hear a bug caused by this, please try the above
 122    version. */
     /* XPROD32(_a, _b, _t, _v, _x, _y) -- cross product on the MAC unit:
      *
      *   _x = (a*t + b*v) shifted right by one   (summed in acc0)
      *   _y = (b*t - a*v) shifted right by one   (summed in acc1)
      *
      * Unlike the canonical definition, which adds two MULT32 results that
      * have each been shifted already, the two products are summed inside
      * the accumulator and the sum is shifted once.
      *
      * _x and _y are written as early-clobber data-register outputs, so
      * they must be assignable lvalues; _a, _b, _t and _v are read only.
      * acc0 and acc1 must be clear on entry and are left clear by the
      * movclr.l instructions.
      *
      * The expansion already ends in a semicolon, so "XPROD32(...);"
      * produces an extra empty statement; do not use the macro as the
      * unbraced body of an if that is followed by an else.
      */
 123 #define XPROD32(_a, _b, _t, _v, _x, _y)     \
 124   asm volatile ("mac.l %[a], %[t], %%acc0;" \
 125                 "mac.l %[b], %[v], %%acc0;" \
 126                 "mac.l %[b], %[t], %%acc1;" \
 127                 "msac.l %[a], %[v], %%acc1;" \
 128                 "movclr.l %%acc0, %[x];" \
 129                 "asr.l #1, %[x];" \
 130                 "movclr.l %%acc1, %[y];" \
 131                 "asr.l #1, %[y];" \
 132                 : [x] "=&d" (_x), [y] "=&d" (_y) \
 133                 : [a] "r" (_a), [b] "r" (_b), \
 134                   [t] "r" (_t), [v] "r" (_v) \
 135                 : "cc");
 136
 137 #ifndef _V_VECT_OPS
 138 #define _V_VECT_OPS
 139
 140 /* asm versions of vector operations for block.c, window.c */
 141 /* assumes MAC is initialized & accumulators cleared */
 142 static inline
 143 void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n)
 144 {
 145   /* align to 16 bytes */
 146   while(n>0 && (int)x&15) {
 147     *x++ += *y++;
 148     n--;
 149   }
 150   asm volatile ("bra 1f;"
 151                 "0:"                          /* loop start */
 152                 "movem.l (%[x]), %%d0-%%d3;"  /* fetch values */
 153                 "movem.l (%[y]), %%a0-%%a3;"
 154                 /* add */
 155                 "add.l %%a0, %%d0;"
 156                 "add.l %%a1, %%d1;"
 157                 "add.l %%a2, %%d2;"
 158                 "add.l %%a3, %%d3;"
 159                 /* store and advance */
 160                 "movem.l %%d0-%%d3, (%[x]);"
 161                 "lea.l (4*4, %[x]), %[x];"
 162                 "lea.l (4*4, %[y]), %[y];"
 163                 "subq.l #4, %[n];"     /* done 4 elements */
 164                 "1: cmpi.l #4, %[n];"
 165                 "bge 0b;"
 166                 : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
 167                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 168                     "cc", "memory");
 169   /* add final elements */
 170   while (n>0) {
 171     *x++ += *y++;
 172     n--;
 173   }
 174 }
 175 static inline
 176 void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n)
 177 {
 178     /* coldfire asm has symmetrical versions of vect_add_right_left
 179        and vect_add_left_right  (since symmetrical versions of
 180        vect_mult_fw and vect_mult_bw  i.e.  both use MULT31) */
 181     vect_add_right_left(x, y, n );
 182 }
 183
 184 static inline
 185 void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n)
 186 {
 187   /* align to 16 bytes */
 188   while(n>0 && (int)x&15) {
 189     *x++ = *y++;
 190     n--;
 191   }
 192   asm volatile ("bra 1f;"
 193                 "0:"                                    /* loop start */
 194                 "movem.l (%[y]), %%d0-%%d3;"            /* fetch values */
 195                 "movem.l %%d0-%%d3, (%[x]);"            /* store */
 196                 "lea.l (4*4, %[x]), %[x];"              /* advance */
 197                 "lea.l (4*4, %[y]), %[y];"
 198                 "subq.l #4, %[n];"                      /* done 4 elements */
 199                 "1: cmpi.l #4, %[n];"
 200                 "bge 0b;"
 201                 : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
 202                 : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
 203   /* copy final elements */
 204   while (n>0) {
 205     *x++ = *y++;
 206     n--;
 207   }
 208 }
 209
 210 static inline
 211 void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
 212 {
       /* In-place windowing, walking the window forwards:
        *   data[i] = MULT31(data[i], window[i])   for i in [0, n)
        *
        * Layout of the routine:
        *   1. C loop: single elements via MULT31 until data is 16-byte
        *      aligned.
        *   2. asm pre-fetch: four data words into d0-d3 and four window
        *      words into a0-a3.
        *   3. asm main loop, four elements per pass: each mac.l multiplies
        *      one data/window pair into its own accumulator and, in the
        *      same instruction, loads the next window word through (w)+.
        *      After the products are stored the next four data words are
        *      loaded, so the registers are always one block ahead.
        *   4. asm tail: the remaining 0..3 elements are multiplied from the
        *      already loaded registers, one at a time through acc0.
        *
        * The numeric label "1:" is defined twice; "bra 1f" reaches the loop
        * condition, while the "beq 1f" instructions in the tail resolve to
        * the final "1:" at the end of the block.
        *
        * Because of the pre-fetching, the asm always reads four words from
        * data and from window before testing n, and each loop pass loads
        * the block after the one it multiplies.  NOTE(review): presumably
        * callers guarantee those extra words are readable -- confirm.
        *
        * acc0-acc3 must be clear on entry (mac.l accumulates) and are left
        * clear by the movclr.l instructions.
        */
 213   /* ensure data is aligned to 16-bytes */
 214   while(n>0 && (int)data&15) {
 215     *data = MULT31(*data, *window);
 216     data++;
 217     window++;
 218     n--;
 219   }
 220   asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch data[0..3] */
 221                 "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch window[0..3] */
 222                 "lea.l (4*4, %[w]), %[w];"    /* w -> next window block */
 223                 "bra 1f;"               /* jump to loop condition */
 224                 "0:" /* loop body */
 225                 /* multiply and load next window values */
 226                 "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
 227                 "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
 228                 "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
 229                 "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
 230                 "movclr.l %%acc0, %%d0;"  /* get the products */
 231                 "movclr.l %%acc1, %%d1;"
 232                 "movclr.l %%acc2, %%d2;"
 233                 "movclr.l %%acc3, %%d3;"
 234                 /* store and advance */
 235                 "movem.l %%d0-%%d3, (%[d]);"
 236                 "lea.l (4*4, %[d]), %[d];"
 237                 "movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch next data block */
 238                 "subq.l #4, %[n];"     /* done 4 elements */
 239                 "1: cmpi.l #4, %[n];"
 240                 "bge 0b;"
 241                 /* multiply final elements */
 242                 "tst.l %[n];"
 243                 "beq 1f;"      /* n=0 */
 244                 "mac.l %%d0, %%a0, %%acc0;"
 245                 "movclr.l %%acc0, %%d0;"
 246                 "move.l %%d0, (%[d])+;"
 247                 "subq.l #1, %[n];"
 248                 "beq 1f;"     /* n=1 */
 249                 "mac.l %%d1, %%a1, %%acc0;"
 250                 "movclr.l %%acc0, %%d1;"
 251                 "move.l %%d1, (%[d])+;"
 252                 "subq.l #1, %[n];"
 253                 "beq 1f;"     /* n=2 */
 254                 /* otherwise n = 3 */
 255                 "mac.l %%d2, %%a2, %%acc0;"
 256                 "movclr.l %%acc0, %%d2;"
 257                 "move.l %%d2, (%[d])+;"
 258                 "1:"
 259                 : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
 260                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 261                     "cc", "memory");
 262 }
 263
 264 static inline
 265 void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
 266 {
       /* In-place windowing, walking the window backwards:
        *   data[i] = MULT31(data[i], window[-i])   for i in [0, n)
        * i.e. data advances while window is stepped towards lower
        * addresses.
        *
        * Layout of the routine:
        *   1. C loop: single elements via MULT31 until data is 16-byte
        *      aligned (window is decremented for each one).
        *   2. asm pre-fetch: w is moved back three words so that one
        *      movem.l loads window[-3..0] into a0..a3; a3 therefore holds
        *      the window word that pairs with d0, a2 the one for d1, and
        *      so on.
        *   3. asm main loop, four elements per pass: each mac.l multiplies
        *      one data/window pair into its own accumulator and, in the
        *      same instruction, loads the next lower window word through
        *      -(w).  After the products are stored the next four data words
        *      are loaded, so the registers are always one block ahead.
        *   4. asm tail: the remaining 0..3 elements are multiplied from the
        *      already loaded registers, one at a time through acc0.
        *
        * The numeric label "1:" is defined twice; "bra 1f" reaches the loop
        * condition, while the "beq 1f" instructions in the tail resolve to
        * the final "1:" at the end of the block.
        *
        * Because of the pre-fetching, the asm always reads four words from
        * data and four words ending at window before testing n, and each
        * loop pass loads the block after the one it multiplies.
        * NOTE(review): presumably callers guarantee those extra words are
        * readable -- confirm.
        *
        * acc0-acc3 must be clear on entry (mac.l accumulates) and are left
        * clear by the movclr.l instructions.
        */
 267   /* ensure at least data is aligned to 16-bytes */
 268   while(n>0 && (int)data&15) {
 269     *data = MULT31(*data, *window);
 270     data++;
 271     window--;
 272     n--;
 273   }
 274   asm volatile ("lea.l (-3*4, %[w]), %[w];"     /* w -> window[-3] */
 275                 "movem.l (%[d]), %%d0-%%d3;"    /* pre-fetch registers */
 276                 "movem.l (%[w]), %%a0-%%a3;"
 277                 "bra 1f;"               /* jump to loop condition */
 278                 "0:" /* loop body */
 279                 /* multiply and load next window value */
 280                 "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
 281                 "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
 282                 "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
 283                 "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
 284                 "movclr.l %%acc0, %%d0;"  /* get the products */
 285                 "movclr.l %%acc1, %%d1;"
 286                 "movclr.l %%acc2, %%d2;"
 287                 "movclr.l %%acc3, %%d3;"
 288                 /* store and advance */
 289                 "movem.l %%d0-%%d3, (%[d]);"
 290                 "lea.l (4*4, %[d]), %[d];"
 291                 "movem.l (%[d]), %%d0-%%d3;"    /* pre-fetch next data block */
 292                 "subq.l #4, %[n];"     /* done 4 elements */
 293                 "1: cmpi.l #4, %[n];"
 294                 "bge 0b;"
 295                 /* multiply final elements */
 296                 "tst.l %[n];"
 297                 "beq 1f;"      /* n=0 */
 298                 "mac.l %%d0, %%a3, %%acc0;"
 299                 "movclr.l %%acc0, %%d0;"
 300                 "move.l %%d0, (%[d])+;"
 301                 "subq.l #1, %[n];"
 302                 "beq 1f;"     /* n=1 */
 303                 "mac.l %%d1, %%a2, %%acc0;"
 304                 "movclr.l %%acc0, %%d1;"
 305                 "move.l %%d1, (%[d])+;"
 306                 "subq.l #1, %[n];"
 307                 "beq 1f;"     /* n=2 */
 308                 /* otherwise n = 3 */
 309                 "mac.l %%d2, %%a1, %%acc0;"
 310                 "movclr.l %%acc0, %%d2;"
 311                 "move.l %%d2, (%[d])+;"
 312                 "1:"
 313                 : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
 314                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 315                     "cc", "memory");
 316 }
 317
 318 #endif
 319
 320 #endif
 321
 322 #ifndef _V_CLIP_MATH
 323 #define _V_CLIP_MATH
 324
 325 /* this is portable C and simple; why not use this as default? */
 326 static inline ogg_int32_t CLIP_TO_15(register ogg_int32_t x) {
 327   register ogg_int32_t hi=32767, lo=-32768;
 328   return (x>=hi ? hi : (x<=lo ? lo : x));
 329 }
 330
 331 #endif
 332 #else
 333 #define LINE_ATTR
 334 #endif