follin/utils.d

   1 /* Invisible Vector Library
   2  * coded by Ketmar // Invisible Vector <ketmar@ketmar.no-ip.org>
   3  * Understanding is not required. Only obedience.
   4  *
   5  * This program is free software: you can redistribute it and/or modify
   6  * it under the terms of the GNU General Public License as published by
   7  * the Free Software Foundation, version 3 of the License ONLY.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  16  */
  17 module iv.follin.utils /*is aliced*/;
  18
  19 import iv.alice;
  20 import iv.follin.ftrick;
  21
  22
  23 // ////////////////////////////////////////////////////////////////////////// //
  24 version(X86) {
  25   version(D_PIC) {} else {
  26     version = follin_use_sse;
  27     version = follin_use_sse2;
  28   }
  29 }
  30
  31
  32 // ////////////////////////////////////////////////////////////////////////// //
  33 /// convert buffer of shorts to buffer of normalized ([-1..1]) floats; will not resize output
  34 public void tflShort2Float (in short[] input, float[] output) nothrow @trusted @nogc {
  35   if (output.length < input.length) assert(0, "invalid length");
  36   auto d = output.ptr;
  37   enum mul = cast(float)(1.0f/32768.0f);
  38   auto src = input.ptr;
  39   auto len = input.length;
  40   while (len >= 4) {
  41     *d++ = mul*(*src++);
  42     *d++ = mul*(*src++);
  43     *d++ = mul*(*src++);
  44     *d++ = mul*(*src++);
  45     len -= 4;
  46   }
  47   while (len-- > 0) *d++ = mul*(*src++);
  48 }
  49
  50
  51 private align(64) __gshared float[256] mvol = 32768.0;
  52 private __gshared ubyte* mvolptr = null;
  53
  54 shared static this () {
  55   mvolptr = cast(ubyte*)mvol.ptr;
  56   if ((cast(uint)mvolptr&0x3f) != 0) {
  57     // fix pointer
  58     mvolptr += 0x40-cast(uint)mvolptr&0x3f;
  59     // and refill
  60     (cast(float*)mvolptr)[0..8] = 32768.0;
  61   }
  62 }
  63
  64
  65 // will not resize output
  66 /// convert buffer of normalize floats ([-1..1]) to buffer of shorts; does float clamping; will not resize output
  67 public void tflFloat2Short (in float[] input, short[] output) nothrow @trusted @nogc {
  68   if (output.length < input.length) assert(0, "invalid length");
  69   auto s = input.ptr;
  70   auto d = output.ptr;
  71   /*ALIGN NOT WORKING YET:*/ version(follin_use_sse) {
  72     auto blen = cast(uint)input.length;
  73     if (blen > 0) {
  74       //TODO: use aligned instructions
  75       float[4] tmp = void;
  76       auto tmpptr = &tmp;
  77       asm nothrow @safe @nogc {
  78         mov       EAX,[mvolptr]; // source
  79         //movntdqa  XMM4,[EAX]; // XMM4: multipliers (sse4.1)
  80         movaps    XMM4,[EAX];
  81         mov       EAX,[s]; // source
  82         mov       EBX,[d]; // dest
  83         mov       ECX,[blen]; // number of numbers ;-)
  84         shr       ECX,2;
  85         jz        skip4part;
  86         // process 4 floats per step
  87         align 8;
  88        finalloopmix:
  89         movups    XMM0,[EAX];
  90         mulps     XMM0,XMM4;    // mul by volume and shift
  91       }
  92       version(follin_use_sse2) asm nothrow @safe @nogc {
  93         cvttps2dq XMM1,XMM0;    // XMM1 now contains four int32 values
  94         packssdw  XMM1,XMM1;
  95         movq      [EBX],XMM1;   // four s16 == one double
  96       } else asm nothrow @safe @nogc {
  97         cvtps2pi  MM0,XMM0;     // MM0 now contains two low int32 values
  98         movhlps   XMM5,XMM0;    // get high floats
  99         cvtps2pi  MM1,XMM5;     // MM1 now contains two high int32 values
 100         packssdw  MM0,MM1;      // MM0 now contains 4 int16 values
 101         movq      [EBX],MM0;
 102       }
 103       asm nothrow @safe @nogc {
 104         add       EAX,16;
 105         add       EBX,8;
 106         dec       ECX;
 107         jnz       finalloopmix;
 108        skip4part:;
 109         mov       [s],EAX;
 110         mov       [d],EBX;
 111       }
 112       version(follin_use_sse2) {} else {
 113         asm nothrow @safe @nogc { emms; }
 114       }
 115       mixin(declfcvar!"temp");
 116       switch ((blen &= 3)) {
 117         case 3:
 118           mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 119           if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 120           *d++ = cast(short)v;
 121           ++s;
 122           goto case;
 123         case 2:
 124           mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 125           if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 126           *d++ = cast(short)v;
 127           ++s;
 128           goto case;
 129         case 1:
 130           mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 131           if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 132           *d++ = cast(short)v;
 133           ++s;
 134           break;
 135         default: break;
 136       }
 137     }
 138   } else {
 139     mixin(declfcvar!"temp");
 140     auto len = input.length;
 141     while (len >= 4) {
 142       {
 143         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 144         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 145         *d++ = cast(short)v;
 146         ++s;
 147       }
 148       {
 149         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 150         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 151         *d++ = cast(short)v;
 152         ++s;
 153       }
 154       {
 155         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 156         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 157         *d++ = cast(short)v;
 158         ++s;
 159       }
 160       {
 161         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 162         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 163         *d++ = cast(short)v;
 164         ++s;
 165       }
 166       len -= 4;
 167     }
 168     switch (len) {
 169       case 3:
 170         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 171         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 172         *d++ = cast(short)v;
 173         ++s;
 174         goto case;
 175       case 2:
 176         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 177         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 178         *d++ = cast(short)v;
 179         ++s;
 180         goto case;
 181       case 1:
 182         mixin(FAST_SCALED_FLOAT_TO_INT!("*s", "15"));
 183         if (cast(uint)(v+32768) > 65535) v = (v < 0 ? -32768 : 32767);
 184         *d++ = cast(short)v;
 185         ++s;
 186         break;
 187       default: break;
 188     }
 189   }
 190 }