apps/codecs/lib/asm_mcf5249.h

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  *
   9  * Copyright (C) 2005 by Pedro Vasconcelos
  10  *
  11  * This program is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU General Public License
  13  * as published by the Free Software Foundation; either version 2
  14  * of the License, or (at your option) any later version.
  15  *
  16  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  17  * KIND, either express or implied.
  18  *
  19  ****************************************************************************/
  20 /* asm routines for wide math on the MCF5249 */
  21
  22 #if defined(CPU_COLDFIRE)
  23
  24 /* Attribute requesting 16-byte alignment for the object it is applied to.
      On builds where CPU_COLDFIRE is not defined this header defines
      LINE_ATTR as empty instead (see the #else branch at the end of the
      file). */
  25 #define LINE_ATTR   __attribute__ ((aligned (16)))
  26
  27 #ifndef _V_WIDE_MATH
  28 #define _V_WIDE_MATH
  29
  /* MULT32: multiply x by y on the EMAC and return the %acc0 read-out
     arithmetically shifted right by one bit. No overflow check is made.
     x is used as both input and result register; y is only read.
     mac.l accumulates into %acc0, so %acc0 must be clear on entry; movclr
     leaves it clear again on exit.
     NOTE(review): presumably the EMAC is configured in fractional mode, so
     that the returned value is the high 32 bits of the 64-bit product --
     confirm against the codec MAC setup. */
  30 static inline int32_t MULT32(int32_t x, int32_t y) {
  31
  32   asm volatile ("mac.l %[x], %[y], %%acc0;"    /* multiply & shift  */
  33                 "movclr.l %%acc0, %[x];"       /* move & clear acc */
  34                 "asr.l #1, %[x];"              /* no overflow test */
  35                 : [x] "+&d" (x)
  36                 : [y] "r" (y)
  37                 : "cc");
  38   return x;
  39 }
  40
  /* MULT31: multiply x by y on the EMAC and return the %acc0 read-out as is
     (the same sequence as MULT32 but without the final 1-bit right shift).
     x is used as both input and result register; y is only read.
     mac.l accumulates into %acc0, so %acc0 must be clear on entry; movclr
     leaves it clear again on exit. */
  41 static inline int32_t MULT31(int32_t x, int32_t y) {
  42   asm volatile ("mac.l %[x], %[y], %%acc0;" /* multiply */
  43                 "movclr.l %%acc0, %[x];"    /* move and clear */
  44                 : [x] "+&r" (x)
  45                 : [y] "r" (y)
  46                 : "cc");
  47   return x;
  48 }
  49
  /* MULT31_SHIFT15: combine both halves of the product x*y into one word.
     Returns (hi << 16) | ((unsigned)lo >> 15), where hi is the %acc0
     read-out of x*y and lo is the low 32 bits of x*y produced by mulu.l.
     All operands are kept in data registers; x is overwritten with the low
     half and used as scratch, r receives the result.
     %acc0 must be clear on entry and is left clear by movclr.
     NOTE(review): the "plus one free" remark below suggests the accumulator
     read-out is already (x*y)>>31, which would make the overall result the
     64-bit product shifted right by 15 -- confirm against the MAC setup. */
  50 static inline int32_t MULT31_SHIFT15(int32_t x, int32_t y) {
  51   int32_t r;
  52
  53   asm volatile ("mac.l %[x], %[y], %%acc0;"  /* multiply */
  54                 "mulu.l %[y], %[x];"         /* get lower half, avoid emac stall */
  55                 "movclr.l %%acc0, %[r];"     /* get higher half */
  56                 "asl.l #8, %[r];"            /* hi<<16, plus one free */
  57                 "asl.l #8, %[r];"            /* second half of the 16-bit shift */
  58                 "lsr.l #8, %[x];"            /* (unsigned)lo >> 15 */
  59                 "lsr.l #7, %[x];"            /* 8 + 7 = 15 bits in total */
  60                 "or.l %[x], %[r];"           /* logical-or results */
  61                 : [r] "=&d" (r), [x] "+d" (x)
  62                 : [y] "d" (y)
  63                 : "cc");
  64   return r;
  65 }
  66
  /* XPROD31: cross product of (a, b) with (t, v), stored through pointers:
       *x = a*t + b*v   (summed in %acc0)
       *y = b*t - a*v   (summed in %acc1; msac subtracts its product)
     Each sum is read out with movclr and stored without any further shift.
     a is reused as the scratch register for both read-outs; x and y must be
     in address registers because they are used as store addresses.
     The memory clobber covers the stores to *x and *y done inside the asm.
     %acc0 and %acc1 must be clear on entry and are left clear on exit. */
  67 static inline
  68 void XPROD31(int32_t  a, int32_t  b,
  69              int32_t  t, int32_t  v,
  70              int32_t *x, int32_t *y)
  71 {
  72   asm volatile ("mac.l %[a], %[t], %%acc0;"
  73                 "mac.l %[b], %[v], %%acc0;"
  74                 "mac.l %[b], %[t], %%acc1;"
  75                 "msac.l %[a], %[v], %%acc1;"
  76                 "movclr.l %%acc0, %[a];"
  77                 "move.l %[a], (%[x]);"
  78                 "movclr.l %%acc1, %[a];"
  79                 "move.l %[a], (%[y]);"
  80                 : [a] "+&r" (a)
  81                 : [x] "a" (x), [y] "a" (y),
  82                   [b] "r" (b), [t] "r" (t), [v] "r" (v)
  83                 : "cc", "memory");
  84 }
  85
  /* XNPROD31: same as XPROD31 but with the signs of the cross terms swapped:
       *x = a*t - b*v   (summed in %acc0; msac subtracts its product)
       *y = b*t + a*v   (summed in %acc1)
     Each sum is read out with movclr and stored without any further shift.
     a is reused as the scratch register for both read-outs; x and y must be
     in address registers because they are used as store addresses.
     The memory clobber covers the stores to *x and *y done inside the asm.
     %acc0 and %acc1 must be clear on entry and are left clear on exit. */
  86 static inline
  87 void XNPROD31(int32_t  a, int32_t  b,
  88               int32_t  t, int32_t  v,
  89               int32_t *x, int32_t *y)
  90 {
  91   asm volatile ("mac.l %[a], %[t], %%acc0;"
  92                 "msac.l %[b], %[v], %%acc0;"
  93                 "mac.l %[b], %[t], %%acc1;"
  94                 "mac.l %[a], %[v], %%acc1;"
  95                 "movclr.l %%acc0, %[a];"
  96                 "move.l %[a], (%[x]);"
  97                 "movclr.l %%acc1, %[a];"
  98                 "move.l %[a], (%[y]);"
  99                 : [a] "+&r" (a)
 100                 : [x] "a" (x), [y] "a" (y),
 101                   [b] "r" (b), [t] "r" (t), [v] "r" (v)
 102                 : "cc", "memory");
 103 }
 104
 105 #if 0    /* canonical Tremor definition, kept for reference only */
 106 #define XPROD32(_a, _b, _t, _v, _x, _y)         \
 107   { (_x)=MULT32(_a,_t)+MULT32(_b,_v);           \
 108     (_y)=MULT32(_b,_t)-MULT32(_a,_v); }
 109 #endif
 110
 111 /* XPROD32(_a, _b, _t, _v, _x, _y): EMAC version of the canonical macro
 112    above. Stores (_a*_t + _b*_v) in _x and (_b*_t - _a*_v) in _y; each sum
 113    is formed in an accumulator, read out with movclr and then shifted
        right by one bit.
        Unlike the canonical version, the two products are added before the
        shift instead of after it. This could lose the LSB by overflow, but I
        do not think it will ever happen. If anyone thinks they can hear a bug
        caused by this, please try the canonical version above.
        _x and _y must be lvalues (they are asm outputs placed in data
        registers). %acc0 and %acc1 must be clear on entry and are left clear.
        Note that the expansion is a bare statement that already ends with a
        semicolon. */
 114 #define XPROD32(_a, _b, _t, _v, _x, _y)     \
 115   asm volatile ("mac.l %[a], %[t], %%acc0;" \
 116                 "mac.l %[b], %[v], %%acc0;" \
 117                 "mac.l %[b], %[t], %%acc1;" \
 118                 "msac.l %[a], %[v], %%acc1;" \
 119                 "movclr.l %%acc0, %[x];" \
 120                 "asr.l #1, %[x];" \
 121                 "movclr.l %%acc1, %[y];" \
 122                 "asr.l #1, %[y];" \
 123                 : [x] "=&d" (_x), [y] "=&d" (_y) \
 124                 : [a] "r" (_a), [b] "r" (_b), \
 125                   [t] "r" (_t), [v] "r" (_v) \
 126                 : "cc");
 127
 /* XPROD31_R(_a, _b, _t, _v, _x, _y): register-output variant of XPROD31.
    Stores (_a*_t + _b*_v) in _x and (_b*_t - _a*_v) in _y, each read out of
    an accumulator with movclr and not shifted any further.
    _x and _y must be lvalues (asm outputs placed in data registers); no
    memory is written by the asm itself.
    %acc0 and %acc1 must be clear on entry and are left clear on exit.
    The expansion is a bare statement that already ends with a semicolon. */
 128 #define XPROD31_R(_a, _b, _t, _v, _x, _y)   \
 129   asm volatile ("mac.l %[a], %[t], %%acc0;" \
 130                 "mac.l %[b], %[v], %%acc0;" \
 131                 "mac.l %[b], %[t], %%acc1;" \
 132                 "msac.l %[a], %[v], %%acc1;" \
 133                 "movclr.l %%acc0, %[x];" \
 134                 "movclr.l %%acc1, %[y];" \
 135                 : [x] "=&d" (_x), [y] "=&d" (_y) \
 136                 : [a] "r" (_a), [b] "r" (_b), \
 137                   [t] "r" (_t), [v] "r" (_v) \
 138                 : "cc");
 139
 /* XNPROD31_R(_a, _b, _t, _v, _x, _y): register-output variant of XNPROD31.
    Stores (_a*_t - _b*_v) in _x and (_b*_t + _a*_v) in _y, each read out of
    an accumulator with movclr and not shifted any further.
    _x and _y must be lvalues (asm outputs placed in data registers); no
    memory is written by the asm itself.
    %acc0 and %acc1 must be clear on entry and are left clear on exit.
    The expansion is a bare statement that already ends with a semicolon. */
 140 #define XNPROD31_R(_a, _b, _t, _v, _x, _y)  \
 141   asm volatile ("mac.l %[a], %[t], %%acc0;" \
 142                 "msac.l %[b], %[v], %%acc0;" \
 143                 "mac.l %[b], %[t], %%acc1;" \
 144                 "mac.l %[a], %[v], %%acc1;" \
 145                 "movclr.l %%acc0, %[x];" \
 146                 "movclr.l %%acc1, %[y];" \
 147                 : [x] "=&d" (_x), [y] "=&d" (_y) \
 148                 : [a] "r" (_a), [b] "r" (_b), \
 149                   [t] "r" (_t), [v] "r" (_v) \
 150                 : "cc");
 151
 152 #ifndef _V_VECT_OPS
 153 #define _V_VECT_OPS
 154
 155 /* asm versions of vector operations for block.c, window.c */
 156 /* assumes MAC is initialized & accumulators cleared */
 /* vect_add: element-wise in-place addition, x[i] += y[i] for i in [0, n).
    Works in three phases:
      1. scalar adds until x (only x, not y) is 16-byte aligned,
      2. an asm loop that handles four elements per iteration with movem,
      3. scalar adds for the 0..3 elements that remain.
    The asm uses d0-d3 and a0-a3 as scratch (listed as clobbers), so x and y
    are placed in other address registers by the compiler. */
 157 static inline
 158 void vect_add(int32_t *x, const int32_t *y, int n)
 159 {
 160   /* align x to 16 bytes; note (int)x&15 parses as ((int)x)&15 */
 161   while(n>0 && (int)x&15) {
 162     *x++ += *y++;
 163     n--;
 164   }
 165   asm volatile ("bra 1f;"                     /* enter at the loop test */
 166                 "0:"                          /* loop start */
 167                 "movem.l (%[x]), %%d0-%%d3;"  /* fetch values */
 168                 "movem.l (%[y]), %%a0-%%a3;"
 169                 /* add */
 170                 "add.l %%a0, %%d0;"
 171                 "add.l %%a1, %%d1;"
 172                 "add.l %%a2, %%d2;"
 173                 "add.l %%a3, %%d3;"
 174                 /* store and advance */
 175                 "movem.l %%d0-%%d3, (%[x]);"
 176                 "lea.l (4*4, %[x]), %[x];"
 177                 "lea.l (4*4, %[y]), %[y];"
 178                 "subq.l #4, %[n];"     /* done 4 elements */
 179                 "1: cmpi.l #4, %[n];"  /* loop test: repeat while n >= 4 */
 180                 "bge 0b;"
 181                 : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
 182                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 183                     "cc", "memory");
 184   /* add final elements */
 185   while (n>0) {
 186     *x++ += *y++;
 187     n--;
 188   }
 189 }
 190
 /* vect_copy: copy n words from y to x, i.e. x[i] = y[i] for i in [0, n).
    Note the argument order: x is the destination and y the source.
    Works in three phases:
      1. scalar copies until x (only x, not y) is 16-byte aligned,
      2. an asm loop that moves four elements per iteration with movem,
      3. scalar copies for the 0..3 elements that remain.
    The asm uses d0-d3 as scratch (listed as clobbers). */
 191 static inline
 192 void vect_copy(int32_t *x, int32_t *y, int n)
 193 {
 194   /* align x to 16 bytes; note (int)x&15 parses as ((int)x)&15 */
 195   while(n>0 && (int)x&15) {
 196     *x++ = *y++;
 197     n--;
 198   }
 199   asm volatile ("bra 1f;"                               /* enter at the loop test */
 200                 "0:"                                    /* loop start */
 201                 "movem.l (%[y]), %%d0-%%d3;"            /* fetch values */
 202                 "movem.l %%d0-%%d3, (%[x]);"            /* store */
 203                 "lea.l (4*4, %[x]), %[x];"              /* advance */
 204                 "lea.l (4*4, %[y]), %[y];"
 205                 "subq.l #4, %[n];"                      /* done 4 elements */
 206                 "1: cmpi.l #4, %[n];"                   /* repeat while n >= 4 */
 207                 "bge 0b;"
 208                 : [n] "+d" (n), [x] "+a" (x), [y] "+a" (y)
 209                 : : "%d0", "%d1", "%d2", "%d3", "cc", "memory");
 210   /* copy final elements */
 211   while (n>0) {
 212     *x++ = *y++;
 213     n--;
 214   }
 215 }
 216
 /* vect_mult_fw: in-place windowing with a forward-running window,
    data[i] = MULT31(data[i], window[i]) for i in [0, n).
    Works in three phases:
      1. scalar MULT31 calls until data is 16-byte aligned,
      2. an asm loop doing four multiplies per iteration, one per
         accumulator (%acc0-%acc3); each mac.l also loads the next window
         word through (w)+,
      3. an asm tail that handles the remaining 0..3 elements with %acc0.
    The accumulators must be clear on entry; every one that is used is read
    back with movclr and therefore left clear.
    d0-d3 and a0-a3 are scratch (listed as clobbers).
    NOTE(review): the pre-fetch before the loop test and the reload at the end
    of each iteration always read four words of data and of window, so up to
    16 bytes beyond the last processed element of each array are read (never
    written), even when n is 0 -- confirm callers can tolerate this. */
 217 static inline
 218 void vect_mult_fw(int32_t *data, int32_t *window, int n)
 219 {
 220   /* ensure data is aligned to 16-bytes */
 221   while(n>0 && (int)data&15) {
 222     *data = MULT31(*data, *window);
 223     data++;
 224     window++;
 225     n--;
 226   }
 227   asm volatile ("movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch 4 data words */
 228                 "movem.l (%[w]), %%a0-%%a3;"  /* pre-fetch 4 window words */
 229                 "lea.l (4*4, %[w]), %[w];"    /* w now points at the next 4 */
 230                 "bra 1f;"               /* jump to loop condition */
 231                 "0:" /* loop body */
 232                 /* multiply and load next window values */
 233                 "mac.l %%d0, %%a0, (%[w])+, %%a0, %%acc0;"
 234                 "mac.l %%d1, %%a1, (%[w])+, %%a1, %%acc1;"
 235                 "mac.l %%d2, %%a2, (%[w])+, %%a2, %%acc2;"
 236                 "mac.l %%d3, %%a3, (%[w])+, %%a3, %%acc3;"
 237                 "movclr.l %%acc0, %%d0;"  /* get the products */
 238                 "movclr.l %%acc1, %%d1;"
 239                 "movclr.l %%acc2, %%d2;"
 240                 "movclr.l %%acc3, %%d3;"
 241                 /* store and advance */
 242                 "movem.l %%d0-%%d3, (%[d]);"
 243                 "lea.l (4*4, %[d]), %[d];"
 244                 "movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch next 4 data words */
 245                 "subq.l #4, %[n];"     /* done 4 elements */
 246                 "1: cmpi.l #4, %[n];"  /* loop condition: repeat while n >= 4 */
 247                 "bge 0b;"
 248                 /* multiply final elements */
 249                 "tst.l %[n];"
 250                 "beq 1f;"      /* n=0 */
 251                 "mac.l %%d0, %%a0, %%acc0;"
 252                 "movclr.l %%acc0, %%d0;"
 253                 "move.l %%d0, (%[d])+;"
 254                 "subq.l #1, %[n];"
 255                 "beq 1f;"     /* n=1 */
 256                 "mac.l %%d1, %%a1, %%acc0;"
 257                 "movclr.l %%acc0, %%d1;"
 258                 "move.l %%d1, (%[d])+;"
 259                 "subq.l #1, %[n];"
 260                 "beq 1f;"     /* n=2 */
 261                 /* otherwise n = 3 */
 262                 "mac.l %%d2, %%a2, %%acc0;"
 263                 "movclr.l %%acc0, %%d2;"
 264                 "move.l %%d2, (%[d])+;"
 265                 "1:"          /* exit label for the tail */
 266                 : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
 267                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 268                     "cc", "memory");
 269 }
 270
 /* vect_mult_bw: in-place windowing with a backward-running window,
    data[i] = MULT31(data[i], window[-i]) for i in [0, n); window points at
    the element paired with data[0] and is walked towards lower addresses.
    Works in three phases:
      1. scalar MULT31 calls until data is 16-byte aligned,
      2. an asm loop doing four multiplies per iteration, one per
         accumulator (%acc0-%acc3); the window words sit in a3..a0 in
         descending address order and each mac.l reloads its register
         through -(w),
      3. an asm tail that handles the remaining 0..3 elements with %acc0.
    The accumulators must be clear on entry; every one that is used is read
    back with movclr and therefore left clear.
    d0-d3 and a0-a3 are scratch (listed as clobbers).
    NOTE(review): the pre-fetch before the loop test and the reloads inside
    the loop always read four words, so up to 16 bytes past the last processed
    data element and up to 16 bytes below the last processed window element
    are read (never written), even when n is 0 -- confirm callers can
    tolerate this. */
 271 static inline
 272 void vect_mult_bw(int32_t *data, int32_t *window, int n)
 273 {
 274   /* ensure at least data is aligned to 16-bytes */
 275   while(n>0 && (int)data&15) {
 276     *data = MULT31(*data, *window);
 277     data++;
 278     window--;
 279     n--;
 280   }
 281   asm volatile ("lea.l (-3*4, %[w]), %[w];"     /* w -> lowest of first 4 window words */
 282                 "movem.l (%[d]), %%d0-%%d3;"    /* pre-fetch registers */
 283                 "movem.l (%[w]), %%a0-%%a3;"    /* a3 holds the word for d0 */
 284                 "bra 1f;"               /* jump to loop condition */
 285                 "0:" /* loop body */
 286                 /* multiply and load next window value */
 287                 "mac.l %%d0, %%a3, -(%[w]), %%a3, %%acc0;"
 288                 "mac.l %%d1, %%a2, -(%[w]), %%a2, %%acc1;"
 289                 "mac.l %%d2, %%a1, -(%[w]), %%a1, %%acc2;"
 290                 "mac.l %%d3, %%a0, -(%[w]), %%a0, %%acc3;"
 291                 "movclr.l %%acc0, %%d0;"  /* get the products */
 292                 "movclr.l %%acc1, %%d1;"
 293                 "movclr.l %%acc2, %%d2;"
 294                 "movclr.l %%acc3, %%d3;"
 295                 /* store and advance */
 296                 "movem.l %%d0-%%d3, (%[d]);"
 297                 "lea.l (4*4, %[d]), %[d];"
 298                 "movem.l (%[d]), %%d0-%%d3;"  /* pre-fetch next 4 data words */
 299                 "subq.l #4, %[n];"     /* done 4 elements */
 300                 "1: cmpi.l #4, %[n];"  /* loop condition: repeat while n >= 4 */
 301                 "bge 0b;"
 302                 /* multiply final elements */
 303                 "tst.l %[n];"
 304                 "beq 1f;"      /* n=0 */
 305                 "mac.l %%d0, %%a3, %%acc0;"
 306                 "movclr.l %%acc0, %%d0;"
 307                 "move.l %%d0, (%[d])+;"
 308                 "subq.l #1, %[n];"
 309                 "beq 1f;"     /* n=1 */
 310                 "mac.l %%d1, %%a2, %%acc0;"
 311                 "movclr.l %%acc0, %%d1;"
 312                 "move.l %%d1, (%[d])+;"
 313                 "subq.l #1, %[n];"
 314                 "beq 1f;"     /* n=2 */
 315                 /* otherwise n = 3 */
 316                 "mac.l %%d2, %%a1, %%acc0;"
 317                 "movclr.l %%acc0, %%d2;"
 318                 "move.l %%d2, (%[d])+;"
 319                 "1:"          /* exit label for the tail */
 320                 : [n] "+d" (n), [d] "+a" (data), [w] "+a" (window)
 321                 : : "%d0", "%d1", "%d2", "%d3", "%a0", "%a1", "%a2", "%a3",
 322                     "cc", "memory");
 323 }
 324
 325 #endif
 326
 327 #endif
 328 /* not used anymore */
 329 /*
 330 #ifndef _V_CLIP_MATH
 331 #define _V_CLIP_MATH
 332
 333 * this is portable C and simple; why not use this as default?
 334 static inline int32_t CLIP_TO_15(register int32_t x) {
 335   register int32_t hi=32767, lo=-32768;
 336   return (x>=hi ? hi : (x<=lo ? lo : x));
 337 }
 338
 339 #endif
 340 */
 341 #else
 342 #define LINE_ATTR
 343 #endif
 344