apps/codecs/demac/libdemac/vector_math16_cf.h

   1 /*
   2
   3 libdemac - A Monkey's Audio decoder
   4
   5 $Id$
   6
   7 Copyright (C) Dave Chapman 2007
   8
   9 Coldfire vector math copyright (C) 2007 Jens Arnold
  10
  11 This program is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2 of the License, or
  14 (at your option) any later version.
  15
  16 This program is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with this program; if not, write to the Free Software
  23 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA
  24
  25 */
  26
  27 /* This version fetches data as 32 bit words, and *recommends* v1 to be
  28  * 32 bit aligned, otherwise performance will suffer. */
  29 static inline void vector_add(int16_t* v1, int16_t* v2)
  30 {
  31 #if ORDER > 16
  32     int cnt = ORDER>>4;
  33 #endif
  34
  35 #define ADDHALFREGS(s1, sum)           /* Add register halves straight. */ \
  36         "move.l " #s1  ",   %%d4  \n"  /* 's1' can be an A or D reg. */    \
  37         "add.l  " #sum ", " #s1  "\n"  /* 'sum' must be a D reg. */        \
  38         "clr.w    %%d4            \n"  /* 's1' and %%d4 are clobbered! */  \
  39         "add.l    %%d4  , " #sum "\n"  \
  40         "move.w " #s1  ", " #sum "\n"
  41
  42 #define ADDHALFXREGS(s1, s2, sum)      /* Add register halves across. */    \
  43         "clr.w  " #sum "          \n"  /* Needs 'sum' pre-swapped, swaps */ \
  44         "add.l  " #s1  ", " #sum "\n"  /* 's2', and clobbers 's1'. */       \
  45         "swap   " #s2  "          \n"  /* 's1' can be an A or D reg. */     \
  46         "add.l  " #s2  ", " #s1  "\n"  /* 'sum' and 's2' must be D regs. */ \
  47         "move.w " #s1  ", " #sum "\n"
  48
  49     asm volatile (
  50         "move.l  %[v2], %%d0         \n"
  51         "and.l   #2, %%d0            \n"
  52         "jeq     20f                 \n"
  53
  54     "10:                             \n"
  55         "move.w  (%[v2])+, %%d0      \n"
  56         "swap    %%d0                \n"
  57     "1:                              \n"
  58         "movem.l (%[v1]), %%a0-%%a3  \n"
  59         "movem.l (%[v2]), %%d1-%%d4  \n"
  60         ADDHALFXREGS(%%a0, %%d1, %%d0)
  61         "move.l  %%d0, (%[v1])+      \n"
  62         ADDHALFXREGS(%%a1, %%d2, %%d1)
  63         "move.l  %%d1, (%[v1])+      \n"
  64         ADDHALFXREGS(%%a2, %%d3, %%d2)
  65         "move.l  %%d2, (%[v1])+      \n"
  66         ADDHALFXREGS(%%a3, %%d4, %%d3)
  67         "move.l  %%d3, (%[v1])+      \n"
  68         "lea.l   (16, %[v2]), %[v2]  \n"
  69         "move.l  %%d4, %%d0          \n"
  70
  71         "movem.l (%[v1]), %%a0-%%a3  \n"
  72         "movem.l (%[v2]), %%d1-%%d4  \n"
  73         ADDHALFXREGS(%%a0, %%d1, %%d0)
  74         "move.l  %%d0, (%[v1])+      \n"
  75         ADDHALFXREGS(%%a1, %%d2, %%d1)
  76         "move.l  %%d1, (%[v1])+      \n"
  77         ADDHALFXREGS(%%a2, %%d3, %%d2)
  78         "move.l  %%d2, (%[v1])+      \n"
  79         ADDHALFXREGS(%%a3, %%d4, %%d3)
  80         "move.l  %%d3, (%[v1])+      \n"
  81 #if ORDER > 16
  82         "lea.l   (16, %[v2]), %[v2]  \n"
  83         "move.l  %%d4, %%d0          \n"
  84
  85         "subq.l  #1, %[cnt]          \n"
  86         "jne     1b                  \n"
  87 #endif
  88         "jra     99f                 \n"
  89
  90     "20:                             \n"
  91     "1:                              \n"
  92         "movem.l (%[v2]), %%a0-%%a3  \n"
  93         "movem.l (%[v1]), %%d0-%%d3  \n"
  94         ADDHALFREGS(%%a0, %%d0)
  95         "move.l  %%d0, (%[v1])+      \n"
  96         ADDHALFREGS(%%a1, %%d1)
  97         "move.l  %%d1, (%[v1])+      \n"
  98         ADDHALFREGS(%%a2, %%d2)
  99         "move.l  %%d2, (%[v1])+      \n"
 100         ADDHALFREGS(%%a3, %%d3)
 101         "move.l  %%d3, (%[v1])+      \n"
 102         "lea.l   (16, %[v2]), %[v2]  \n"
 103
 104         "movem.l (%[v2]), %%a0-%%a3  \n"
 105         "movem.l (%[v1]), %%d0-%%d3  \n"
 106         ADDHALFREGS(%%a0, %%d0)
 107         "move.l  %%d0, (%[v1])+      \n"
 108         ADDHALFREGS(%%a1, %%d1)
 109         "move.l  %%d1, (%[v1])+      \n"
 110         ADDHALFREGS(%%a2, %%d2)
 111         "move.l  %%d2, (%[v1])+      \n"
 112         ADDHALFREGS(%%a3, %%d3)
 113         "move.l  %%d3, (%[v1])+      \n"
 114 #if ORDER > 16
 115         "lea.l   (16, %[v2]), %[v2]  \n"
 116
 117         "subq.l  #1, %[cnt]          \n"
 118         "jne     1b                  \n"
 119 #endif
 120     "99:                             \n"
 121         : /* outputs */
 122 #if ORDER > 16
 123         [cnt]"+d"(cnt),
 124 #endif
 125         [v1] "+a"(v1),
 126         [v2] "+a"(v2)
 127         : /* inputs */
 128         : /* clobbers */
 129         "d0", "d1", "d2", "d3", "d4",
 130         "a0", "a1", "a2", "a3", "memory"
 131     );
 132 }
 133
 134 /* This version fetches data as 32 bit words, and *recommends* v1 to be
 135  * 32 bit aligned, otherwise performance will suffer. */
 136 static inline void vector_sub(int16_t* v1, int16_t* v2)
 137 {
 138 #if ORDER > 16
 139     int cnt = ORDER>>4;
 140 #endif
 141
 142 #define SUBHALFREGS(min, sub, dif)    /* Subtract register halves straight. */ \
 143         "move.l " #min ", " #dif "\n" /* 'min' can be an A or D reg */         \
 144         "sub.l  " #sub ", " #min "\n" /* 'sub' and 'dif' must be D regs */     \
 145         "clr.w  " #sub           "\n" /* 'min' and 'sub' are clobbered! */     \
 146         "sub.l  " #sub ", " #dif "\n" \
 147         "move.w " #min ", " #dif "\n"
 148
 149 #define SUBHALFXREGS(min, s2, s1d)    /* Subtract register halves across. */ \
 150         "clr.w  " #s1d           "\n" /* Needs 's1d' pre-swapped, swaps */   \
 151         "sub.l  " #s1d ", " #min "\n" /* 's2' and clobbers 'min'. */         \
 152         "move.l " #min ", " #s1d "\n" /* 'min' can be an A or D reg, */      \
 153         "swap   " #s2            "\n" /* 's2' and 's1d' must be D regs. */   \
 154         "sub.l  " #s2  ", " #min "\n" \
 155         "move.w " #min ", " #s1d "\n"
 156
 157     asm volatile (
 158         "move.l  %[v2], %%d0         \n"
 159         "and.l   #2, %%d0            \n"
 160         "jeq     20f                 \n"
 161
 162     "10:                             \n"
 163         "move.w  (%[v2])+, %%d0      \n"
 164         "swap    %%d0                \n"
 165     "1:                              \n"
 166         "movem.l (%[v2]), %%d1-%%d4  \n"
 167         "movem.l (%[v1]), %%a0-%%a3  \n"
 168         SUBHALFXREGS(%%a0, %%d1, %%d0)
 169         "move.l  %%d0, (%[v1])+      \n"
 170         SUBHALFXREGS(%%a1, %%d2, %%d1)
 171         "move.l  %%d1, (%[v1])+      \n"
 172         SUBHALFXREGS(%%a2, %%d3, %%d2)
 173         "move.l  %%d2, (%[v1])+      \n"
 174         SUBHALFXREGS(%%a3, %%d4, %%d3)
 175         "move.l  %%d3, (%[v1])+      \n"
 176         "lea.l   (16, %[v2]), %[v2]  \n"
 177         "move.l  %%d4, %%d0          \n"
 178
 179         "movem.l (%[v2]), %%d1-%%d4  \n"
 180         "movem.l (%[v1]), %%a0-%%a3  \n"
 181         SUBHALFXREGS(%%a0, %%d1, %%d0)
 182         "move.l  %%d0, (%[v1])+      \n"
 183         SUBHALFXREGS(%%a1, %%d2, %%d1)
 184         "move.l  %%d1, (%[v1])+      \n"
 185         SUBHALFXREGS(%%a2, %%d3, %%d2)
 186         "move.l  %%d2, (%[v1])+      \n"
 187         SUBHALFXREGS(%%a3, %%d4, %%d3)
 188         "move.l  %%d3, (%[v1])+      \n"
 189 #if ORDER > 16
 190         "lea.l   (16, %[v2]), %[v2]  \n"
 191         "move.l  %%d4, %%d0          \n"
 192
 193         "subq.l  #1, %[cnt]          \n"
 194         "bne.w   1b                  \n"
 195 #endif
 196         "jra     99f                 \n"
 197
 198     "20:                             \n"
 199     "1:                              \n"
 200         "movem.l (%[v2]), %%d1-%%d4  \n"
 201         "movem.l (%[v1]), %%a0-%%a3  \n"
 202         SUBHALFREGS(%%a0, %%d1, %%d0)
 203         "move.l  %%d0, (%[v1])+      \n"
 204         SUBHALFREGS(%%a1, %%d2, %%d1)
 205         "move.l  %%d1, (%[v1])+      \n"
 206         SUBHALFREGS(%%a2, %%d3, %%d2)
 207         "move.l  %%d2, (%[v1])+      \n"
 208         SUBHALFREGS(%%a3, %%d4, %%d3)
 209         "move.l  %%d3, (%[v1])+      \n"
 210         "lea.l   (16, %[v2]), %[v2]  \n"
 211
 212         "movem.l (%[v2]), %%d1-%%d4  \n"
 213         "movem.l (%[v1]), %%a0-%%a3  \n"
 214         SUBHALFREGS(%%a0, %%d1, %%d0)
 215         "move.l  %%d0, (%[v1])+      \n"
 216         SUBHALFREGS(%%a1, %%d2, %%d1)
 217         "move.l  %%d1, (%[v1])+      \n"
 218         SUBHALFREGS(%%a2, %%d3, %%d2)
 219         "move.l  %%d2, (%[v1])+      \n"
 220         SUBHALFREGS(%%a3, %%d4, %%d3)
 221         "move.l  %%d3, (%[v1])+      \n"
 222 #if ORDER > 16
 223         "lea.l   (16, %[v2]), %[v2]  \n"
 224
 225         "subq.l  #1, %[cnt]          \n"
 226         "bne.w   1b                  \n"
 227 #endif
 228
 229     "99:                             \n"
 230         : /* outputs */
 231 #if ORDER > 16
 232         [cnt]"+d"(cnt),
 233 #endif
 234         [v1] "+a"(v1),
 235         [v2] "+a"(v2)
 236         : /* inputs */
 237         : /* clobbers */
 238         "d0", "d1", "d2", "d3", "d4",
 239         "a0", "a1", "a2", "a3", "memory"
 240     );
 241 }
 242
 243 #define PREPARE_SCALARPRODUCT coldfire_set_macsr(0); /* signed integer mode */
 244
 245 /* This version fetches data as 32 bit words, and *recommends* v1 to be
 246  * 32 bit aligned, otherwise performance will suffer. It also needs EMAC
 247  * in signed integer mode - call above macro before use. */
 248 static inline int32_t scalarproduct(int16_t* v1, int16_t* v2)
 249 {
 250     int res;
 251 #if ORDER > 32
 252     int cnt = ORDER>>5;
 253 #endif
 254
 255 #if ORDER > 16
 256 #define MAC_BLOCKS "7"
 257 #else
 258 #define MAC_BLOCKS "3"
 259 #endif
 260
 261     asm volatile (
 262         "move.l  %[v2], %%d0                         \n"
 263         "and.l   #2, %%d0                            \n"
 264         "jeq     20f                                 \n"
 265
 266     "10:                                             \n"
 267         "move.l  (%[v1])+, %%d0                      \n"
 268         "move.w  (%[v2])+, %%d1                      \n"
 269     "1:                                              \n"
 270         ".rept " MAC_BLOCKS                         "\n"
 271         "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 272         "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 273         "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 274         "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 275         ".endr                                       \n"
 276
 277         "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 278         "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 279         "mac.w   %%d0u, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 280 #if ORDER > 32
 281         "mac.w   %%d0l, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 282         "subq.l  #1, %[res]                          \n"
 283         "bne.w   1b                                  \n"
 284 #else
 285         "mac.w   %%d0l, %%d1u, %%acc0                \n"
 286 #endif
 287         "jra     99f                                  \n"
 288
 289     "20:                                             \n"
 290         "move.l  (%[v1])+, %%d0                      \n"
 291         "move.l  (%[v2])+, %%d1                      \n"
 292     "1:                                              \n"
 293         ".rept " MAC_BLOCKS                         "\n"
 294         "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
 295         "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 296         "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 297         "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 298         ".endr                                       \n"
 299
 300         "mac.w   %%d0u, %%d1u, (%[v1])+, %%d2, %%acc0\n"
 301         "mac.w   %%d0l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 302 #if ORDER > 32
 303         "mac.w   %%d2u, %%d1u, (%[v1])+, %%d0, %%acc0\n"
 304         "mac.w   %%d2l, %%d1l, (%[v2])+, %%d1, %%acc0\n"
 305         "subq.l  #1, %[res]                          \n"
 306         "bne.w   1b                                  \n"
 307 #else
 308         "mac.w   %%d2u, %%d1u, %%acc0                \n"
 309         "mac.w   %%d2l, %%d1l, %%acc0                \n"
 310 #endif
 311
 312     "99:                                             \n"
 313         "movclr.l %%acc0, %[res]                     \n"
 314         : /* outputs */
 315         [v1]"+a"(v1),
 316         [v2]"+a"(v2),
 317         [res]"=d"(res)
 318         : /* inputs */
 319 #if ORDER > 32
 320         [cnt]"[res]"(cnt)
 321 #endif
 322         : /* clobbers */
 323         "d0", "d1", "d2"
 324     );
 325     return res;
 326 }