APE codec: Speed up decoding of -c2000 and higher on ARMv4 and coldfire by fusing...
[kugel-rb.git] / apps / codecs / demac / libdemac / vector_math16_cf.h
/*

libdemac - A Monkey's Audio decoder

$Id$

Copyright (C) Dave Chapman 2007

Coldfire vector math copyright (C) 2007 Jens Arnold

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA

*/

#define FUSED_VECTOR_MATH

#define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
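/* Usage sketch (hypothetical, for illustration): every routine below relies
 * on the EMAC being in signed integer mode, so a caller would be expected to
 * issue PREPARE_SCALARPRODUCT (i.e. coldfire_set_macsr(0)) once before
 * entering its filter loop and calling scalarproduct(), vector_sp_add() or
 * vector_sp_sub(). */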
/* Calculate scalarproduct, then add a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
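/* Rough plain-C sketch of what vector_sp_add() computes (illustrative only,
 * not part of the build; the loop shape is an assumption, the asm below does
 * the same work 16 samples per iteration with the EMAC):
 *
 *     int32_t res = 0;
 *     int i;
 *     for (i = 0; i < ORDER; i++) {
 *         res   += v1[i] * f2[i];     accumulate the scalar product
 *         v1[i] += s2[i];             fused in-place add of the 2nd vector
 *     }
 *     return res;
 */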
static inline int32_t vector_sp_add(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
#define ADDHALFREGS(s1, s2, sum)        /* Add register halves straight.   */ \
        "move.l " #s1 ", " #sum "\n"    /* 's1' and 's2' can be A or D     */ \
        "add.l  " #s2 ", " #s1  "\n"    /* regs, 'sum' must be a D reg.    */ \
        "clr.w  " #sum          "\n"    /* 's1' is clobbered!              */ \
        "add.l  " #s2 ", " #sum "\n"                                           \
        "move.w " #s1 ", " #sum "\n"

#define ADDHALFXREGS(s1, s2, sum)       /* Add register halves across.     */ \
        "clr.w  " #sum          "\n"    /* Needs 'sum' pre-swapped, swaps  */ \
        "add.l  " #s1 ", " #sum "\n"    /* 's2', and clobbers 's1'.        */ \
        "swap   " #s2           "\n"    /* 's1' can be an A or D reg.      */ \
        "add.l  " #s2 ", " #s1  "\n"    /* 'sum' and 's2' must be D regs.  */ \
        "move.w " #s1 ", " #sum "\n"
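/* In C terms, ADDHALFREGS(s1, s2, sum) is roughly the packed halfword add
 *
 *     sum = ((s1 + s2) & 0xffffu) | ((s1 & 0xffff0000u) + (s2 & 0xffff0000u));
 *
 * i.e. both 16 bit lanes are added independently, with no carry propagating
 * between them. ADDHALFXREGS performs the same kind of lane-wise add but pairs
 * the high half of one register with the low half of the other, which is what
 * the unaligned (halfword-offset) path below needs. This description is
 * illustrative only; the exact register constraints are given in the macro
 * comments above. */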
    asm volatile (
        "move.l  %[f2], %%d0                  \n"
        "and.l   #2, %%d0                     \n"
        "jeq     20f                          \n"

    "10:                                      \n"
        "move.w  (%[f2])+, %%d0               \n"
        "move.w  (%[s2])+, %%d1               \n"
        "swap    %%d1                         \n"
    "1:                                       \n"
        ".rept   2                            \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
        ADDHALFXREGS(%%d6, %%d2, %%d1)
        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
        "move.l  %%d1, (%[v1])+               \n"
        ADDHALFXREGS(%%d7, %%d6, %%d2)
        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFXREGS(%%a0, %%d7, %%d6)
        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
        "move.l  %%d6, (%[v1])+               \n"
        ADDHALFXREGS(%%a1, %%d1, %%d7)
        "move.l  %%d7, (%[v1])+               \n"
        ".endr                                \n"

#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif
        "jra     99f                          \n"

    "20:                                      \n"
        "move.l  (%[f2])+, %%d0               \n"
    "1:                                       \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        ADDHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w   %%d0l, %%a1l, %%acc0         \n"
#endif
        "move.l  %%d2, (%[v1])+               \n"
        ADDHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"
#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif

    "99:                                      \n"
        "movclr.l %%acc0, %[res]              \n"
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"
    );
    return res;
}
/* Calculate scalarproduct, then subtract a 2nd vector (fused for performance)
 * This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned. It also assumes that f2 and s2 are either both 32 bit
 * aligned or both unaligned. Performance will suffer if either condition
 * isn't met. It also needs EMAC in signed integer mode. */
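/* Rough plain-C sketch of what vector_sp_sub() computes (illustrative only,
 * not part of the build; the loop shape is an assumption, the asm below does
 * the same work 16 samples per iteration with the EMAC):
 *
 *     int32_t res = 0;
 *     int i;
 *     for (i = 0; i < ORDER; i++) {
 *         res   += v1[i] * f2[i];     accumulate the scalar product
 *         v1[i] -= s2[i];             fused in-place subtract of the 2nd vector
 *     }
 *     return res;
 */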
static inline int32_t vector_sp_sub(int16_t* v1, int16_t* f2, int16_t* s2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
#define SUBHALFREGS(min, sub, dif)      /* Subtract register halves straight. */ \
        "move.l " #min ", " #dif "\n"   /* 'min' can be an A or D reg         */ \
        "sub.l  " #sub ", " #min "\n"   /* 'sub' and 'dif' must be D regs     */ \
        "clr.w  " #sub           "\n"   /* 'min' and 'sub' are clobbered!     */ \
        "sub.l  " #sub ", " #dif "\n"                                            \
        "move.w " #min ", " #dif "\n"

#define SUBHALFXREGS(min, s2, s1d)      /* Subtract register halves across.   */ \
        "clr.w  " #s1d           "\n"   /* Needs 's1d' pre-swapped, swaps     */ \
        "sub.l  " #s1d ", " #min "\n"   /* 's2' and clobbers 'min'.           */ \
        "move.l " #min ", " #s1d "\n"   /* 'min' can be an A or D reg,        */ \
        "swap   " #s2            "\n"   /* 's2' and 's1d' must be D regs.     */ \
        "sub.l  " #s2 ", " #min  "\n"                                            \
        "move.w " #min ", " #s1d "\n"
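/* In C terms, SUBHALFREGS(min, sub, dif) is roughly the packed halfword subtract
 *
 *     dif = ((min - sub) & 0xffffu) | ((min & 0xffff0000u) - (sub & 0xffff0000u));
 *
 * i.e. both 16 bit lanes are subtracted independently, with no borrow crossing
 * between them. SUBHALFXREGS is the corresponding lane-wise subtract that pairs
 * the high half of one register with the low half of the other, as used on the
 * unaligned (halfword-offset) path below. Illustrative description only; the
 * register constraints are spelled out in the macro comments above. */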
    asm volatile (
        "move.l  %[f2], %%d0                  \n"
        "and.l   #2, %%d0                     \n"
        "jeq     20f                          \n"

    "10:                                      \n"
        "move.w  (%[f2])+, %%d0               \n"
        "move.w  (%[s2])+, %%d1               \n"
        "swap    %%d1                         \n"
    "1:                                       \n"
        ".rept   2                            \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0l, %%d6u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d6l, (%[s2])+, %%d2, %%acc0\n"
        SUBHALFXREGS(%%d6, %%d2, %%d1)
        "mac.w   %%d0l, %%d7u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%d7l, (%[s2])+, %%d6, %%acc0\n"
        "move.l  %%d1, (%[v1])+               \n"
        SUBHALFXREGS(%%d7, %%d6, %%d2)
        "mac.w   %%d0l, %%a0u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a0l, (%[s2])+, %%d7, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFXREGS(%%a0, %%d7, %%d6)
        "mac.w   %%d0l, %%a1u, (%[f2])+, %%d0, %%acc0\n"
        "mac.w   %%d0u, %%a1l, (%[s2])+, %%d1, %%acc0\n"
        "move.l  %%d6, (%[v1])+               \n"
        SUBHALFXREGS(%%a1, %%d1, %%d7)
        "move.l  %%d7, (%[v1])+               \n"
        ".endr                                \n"

#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif
        "jra     99f                          \n"

    "20:                                      \n"
        "move.l  (%[f2])+, %%d0               \n"
    "1:                                       \n"
        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"

        "movem.l (%[v1]), %%d6-%%d7/%%a0-%%a1 \n"
        "mac.w   %%d0u, %%d6u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d6l, (%[f2])+, %%d0, %%acc0\n"
        SUBHALFREGS(%%d6, %%d1, %%d2)
        "mac.w   %%d0u, %%d7u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d7l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%d7, %%d1, %%d2)
        "mac.w   %%d0u, %%a0u, (%[s2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%a0l, (%[f2])+, %%d0, %%acc0\n"
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a0, %%d1, %%d2)
        "mac.w   %%d0u, %%a1u, (%[s2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%a1l, (%[f2])+, %%d0, %%acc0\n"
#else
        "mac.w   %%d0l, %%a1l, %%acc0         \n"
#endif
        "move.l  %%d2, (%[v1])+               \n"
        SUBHALFREGS(%%a1, %%d1, %%d2)
        "move.l  %%d2, (%[v1])+               \n"
#if ORDER > 16
        "subq.l  #1, %[res]                   \n"
        "bne.w   1b                           \n"
#endif

    "99:                                      \n"
        "movclr.l %%acc0, %[res]              \n"
        : /* outputs */
        [v1]"+a"(v1),
        [f2]"+a"(f2),
        [s2]"+a"(s2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2", "d6", "d7",
        "a0", "a1", "memory"
    );
    return res;
}
/* This version fetches data as 32 bit words, and *recommends* v1 to be
 * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 * in signed integer mode. */
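/* Rough plain-C sketch of the value this routine returns (illustrative only;
 * the asm below computes it with the EMAC, 16 samples per loop iteration):
 *
 *     int32_t res = 0;
 *     int i;
 *     for (i = 0; i < ORDER; i++)
 *         res += v1[i] * v2[i];
 *     return res;
 */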
static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
{
    int res;
#if ORDER > 16
    int cnt = ORDER>>4;
#endif
    asm volatile (
        "move.l  %[v2], %%d0                  \n"
        "and.l   #2, %%d0                     \n"
        "jeq     20f                          \n"

    "10:                                      \n"
        "move.l  (%[v1])+, %%d0               \n"
        "move.w  (%[v2])+, %%d1               \n"
    "1:                                       \n"
        ".rept   7                            \n"
        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        ".endr                                \n"

        "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "subq.l  #1, %[res]                   \n"
        "bne.b   1b                           \n"
#else
        "mac.w   %%d0l, %%d1u, %%acc0         \n"
#endif
        "jra     99f                          \n"

    "20:                                      \n"
        "move.l  (%[v1])+, %%d0               \n"
        "move.l  (%[v2])+, %%d1               \n"
    "1:                                       \n"
        ".rept   3                            \n"
        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        ".endr                                \n"

        "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
        "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
#if ORDER > 16
        "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
        "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
        "subq.l  #1, %[res]                   \n"
        "bne.b   1b                           \n"
#else
        "mac.w   %%d2u, %%d1u, %%acc0         \n"
        "mac.w   %%d2l, %%d1l, %%acc0         \n"
#endif

    "99:                                      \n"
        "movclr.l %%acc0, %[res]              \n"
        : /* outputs */
        [v1]"+a"(v1),
        [v2]"+a"(v2),
        [res]"=d"(res)
        : /* inputs */
#if ORDER > 16
        [cnt]"[res]"(cnt)
#endif
        : /* clobbers */
        "d0", "d1", "d2"
    );
    return res;
}