include/gmx_sse2_single.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   7  * Copyright (c) 2001-2009, The GROMACS Development Team
   8  *
   9  * Gromacs is a library for molecular simulation and trajectory analysis,
  10  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  11  * a full list of developers and information, check out http://www.gromacs.org
  12  *
  13  * This program is free software; you can redistribute it and/or modify it under
  14  * the terms of the GNU Lesser General Public License as published by the Free
  15  * Software Foundation; either version 2 of the License, or (at your option) any
  16  * later version.
  17  * As a special exception, you may use this file as part of a free software
  18  * library without restriction.  Specifically, if other files instantiate
  19  * templates or use macros or inline functions from this file, or you compile
  20  * this file and link it with other files to produce an executable, this
  21  * file does not by itself cause the resulting executable to be covered by
  22  * the GNU Lesser General Public License.
  23  *
  24  * In plain-speak: do not worry about classes/macros/templates either - only
  25  * changes to the library have to be LGPL, not an application linking with it.
  26  *
  27  * To help fund GROMACS development, we humbly ask that you cite
  28  * the papers people have written on it - you can find them on the website!
  29  */
  30 #ifdef HAVE_CONFIG_H
  31 #include <config.h>
  32 #endif
  33
  34 /* We require SSE2 now! */
  35
  36 #include <math.h>
  37
  38
  39 #include <xmmintrin.h> /* SSE */
  40 #include <emmintrin.h> /* SSE2 */
  41
  42 #ifdef GMX_SSE3
  43 #  include <pmmintrin.h> /* SSE3 */
  44 #endif
  45 #ifdef GMX_SSE4
  46 #  include <smmintrin.h> /* SSE4.1 */
  47 #endif
  48
  49 #include <stdio.h>
  50
  51 /***************************************************
  52  *                                                 *
  53  * COMPILER RANT WARNING:                          *
  54  *                                                 *
  55  * Ideally, this header would be filled with       *
  56  * simple static inline functions. Unfortunately,  *
  57  * many vendors provide really braindead compilers *
  58  * that either cannot handle more than 1-2 SSE     *
  59  * function parameters, and some cannot handle     *
  60  * pointers to SSE __m128 datatypes as parameters  *
  61  * at all. Thus, for portability we have had to    *
  62  * implement all but the simplest routines as      *
  63  * macros instead...                               *
  64  *                                                 *
  65  ***************************************************/
  66
  67
  68 /***************************************************
  69  *                                                 *
  70  *   Wrappers/replacements for some instructions   *
  71  *   not available in all SSE versions.            *
  72  *                                                 *
  73  ***************************************************/
  74
  75 #ifdef GMX_SSE4
  76 #  define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
  77 #else
  78 #  define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
  79 #endif
  80
  81 /*
  82  * Some compilers require a cast to change the interpretation
  83  * of a register from FP to Int and vice versa, and not all of
  84  * the provide instructions to do this. Roll our own wrappers...
  85  */
  86
  87 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
  88 #  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  89 #  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
  90 #  define gmx_mm_castps_ps128(a) (a)
  91 #elif defined(__GNUC__)
  92 #  define gmx_mm_castsi128_ps(a) ((__m128)(a))
  93 #  define gmx_mm_castps_si128(a) ((__m128i)(a))
  94 #  define gmx_mm_castps_ps128(a) ((__m128)(a))
  95 #else
  96 static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *) &a;  }
  97 static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; }
  98 static __m128  gmx_mm_castps_ps128(__m128 a) { return *(__m128 *) &a;  }
  99 #endif
 100
 101
 102
 103 /* IO functions, just for debugging */
 104
 105 static void
 106 printxmm(const char *s,__m128 xmm)
 107 {
 108         float f[4];
 109
 110         _mm_storeu_ps(f,xmm);
 111         printf("%s: %8.5g %8.5g %8.5g %8.5g\n",s,f[0],f[1],f[2],f[3]);
 112 }
 113
 114
 115 static void
 116 printxmmsum(const char *s,__m128 xmm)
 117 {
 118         float f[4];
 119
 120         _mm_storeu_ps(f,xmm);
 121         printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
 122 }
 123
 124
 125 static void
 126 printxmmi(const char *s,__m128i xmmi)
 127 {
 128     int i[4];
 129
 130     _mm_storeu_si128((__m128i *)i,xmmi);
 131     printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
 132 }
 133
 134
 135 /************************
 136  *                      *
 137  * Simple math routines *
 138  *                      *
 139  ************************/
 140
 141 static inline __m128
 142 gmx_mm_invsqrt_ps(__m128 x)
 143 {
 144     const __m128 half  = {0.5,0.5,0.5,0.5};
 145     const __m128 three = {3.0,3.0,3.0,3.0};
 146
 147     __m128 lu = _mm_rsqrt_ps(x);
 148
 149     return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
 150 }
 151
 152 static inline __m128
 153 gmx_mm_inv_ps(__m128 x)
 154 {
 155         const __m128 two = {2.0f,2.0f,2.0f,2.0f};
 156
 157     __m128 lu = _mm_rcp_ps(x);
 158
 159         return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
 160 }
 161
 162
 163 static inline __m128
 164 gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
 165 {
 166     return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
 167 }
 168
 169 /* Normal sum of four xmm registers */
 170 static inline __m128
 171 gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
 172 {
 173     t0 = _mm_add_ps(t0,t1);
 174     t2 = _mm_add_ps(t2,t3);
 175     return _mm_add_ps(t0,t2);
 176 }
 177
 178
 179 static __m128
 180 gmx_mm_log_ps(__m128 x)
 181 {
 182         const __m128 exp_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 183         const __m128 one_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
 184         const __m128 off_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
 185         const __m128 mant_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
 186         const __m128 base_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
 187         const __m128 loge_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );
 188
 189         const __m128 D5      = gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
 190         const __m128 D4      = gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
 191         const __m128 D3      = gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9) );
 192         const __m128 D2      = gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
 193         const __m128 D1      = gmx_mm_castsi128_ps( _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD) );
 194         const __m128 D0      = gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );
 195
 196         __m128  xmm0,xmm1,xmm2;
 197
 198         xmm0  = x;
 199         xmm1  = xmm0;
 200         xmm1  = _mm_and_ps(xmm1, exp_ps);
 201         xmm1 = gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1),8) );
 202
 203         xmm1  = _mm_or_ps(xmm1, one_ps);
 204         xmm1  = _mm_sub_ps(xmm1, off_ps);
 205
 206         xmm1  = _mm_mul_ps(xmm1, base_ps);
 207         xmm0  = _mm_and_ps(xmm0, mant_ps);
 208         xmm0  = _mm_or_ps(xmm0, one_ps);
 209
 210         xmm2  = _mm_mul_ps(xmm0, D5);
 211         xmm2  = _mm_add_ps(xmm2, D4);
 212         xmm2  = _mm_mul_ps(xmm2,xmm0);
 213         xmm2  = _mm_add_ps(xmm2, D3);
 214         xmm2  = _mm_mul_ps(xmm2,xmm0);
 215         xmm2  = _mm_add_ps(xmm2, D2);
 216         xmm2  = _mm_mul_ps(xmm2,xmm0);
 217         xmm2  = _mm_add_ps(xmm2, D1);
 218         xmm2  = _mm_mul_ps(xmm2,xmm0);
 219         xmm2  = _mm_add_ps(xmm2, D0);
 220         xmm0  = _mm_sub_ps(xmm0, one_ps);
 221         xmm0  = _mm_mul_ps(xmm0,xmm2);
 222         xmm1  = _mm_add_ps(xmm1,xmm0);
 223
 224         x     = xmm1;
 225         x  = _mm_mul_ps(x, loge_ps);
 226
 227     return x;
 228 }
 229
 230
 231 /* This exp-routine has a relative precision of:
 232  *   2^-22.33 bits (essentially single precision :-)
 233  * WARNING: no check against over or underflows (x beyond +-87)
 234  */
 235 static __m128
 236 gmx_mm_exp_ps(__m128 x)
 237 {
 238     const __m128i half = _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000);   // 0.5e+0f
 239     const __m128i base = _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F);   // 127
 240     const __m128i CC   = _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B);   // log2(e)
 241
 242     const __m128i D5   = _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905);   // 1.8775767e-3f
 243     const __m128i D4   = _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806);   // 8.9893397e-3f
 244     const __m128i D3   = _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23);   // 5.5826318e-2f
 245     const __m128i D2   = _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4);   // 2.4015361e-1f
 246     const __m128i D1   = _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B);   // 6.9315308e-1f
 247     const __m128i D0   = _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF);   // 9.9999994e-1f
 248
 249         __m128 xmm0,xmm1;
 250         __m128i xmm2;
 251
 252         xmm0 = _mm_mul_ps(x,gmx_mm_castsi128_ps(CC));
 253         xmm1 = _mm_sub_ps(xmm0,gmx_mm_castsi128_ps(half));
 254         xmm2 = _mm_cvtps_epi32(xmm1);
 255         xmm1 = _mm_cvtepi32_ps(xmm2);
 256
 257         xmm2 = _mm_add_epi32(xmm2,gmx_mm_castps_si128(base));
 258         xmm2 = _mm_slli_epi32(xmm2,23);
 259
 260         xmm0 = _mm_sub_ps(xmm0,xmm1);
 261         xmm1 = _mm_mul_ps(xmm0,gmx_mm_castsi128_ps(D5));
 262         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D4));
 263         xmm1 = _mm_mul_ps(xmm1,xmm0);
 264         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D3));
 265         xmm1 = _mm_mul_ps(xmm1,xmm0);
 266         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D2));
 267         xmm1 = _mm_mul_ps(xmm1,xmm0);
 268         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D1));
 269         xmm1 = _mm_mul_ps(xmm1,xmm0);
 270         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D0));
 271         xmm1 = _mm_mul_ps(xmm1,gmx_mm_castsi128_ps(xmm2));
 272
 273         /* 18 instructions currently */
 274         return xmm1;
 275 }
 276
 277
 278 /* Same as gmx_mm_exp_ps, but has a lower bound check, such that it can
 279  * be safely called with x < -87.33.
 280  * WARNING: no check against overflows (x > 87)
 281  */
 282 static __m128
 283 gmx_mm_exp_ps_lbc(__m128 x)
 284 {
 285     const __m128i lim  = _mm_set_epi32(0xC2AE0000, 0xC2AE0000, 0xC2AE0000, 0xC2AE0000);   // -87
 286     const __m128i half = _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000);   // 0.5e+0f
 287     const __m128i base = _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F);   // 127
 288     const __m128i CC   = _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B);   // log2(e)
 289
 290     const __m128i D5   = _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905);   // 1.8775767e-3f
 291     const __m128i D4   = _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806);   // 8.9893397e-3f
 292     const __m128i D3   = _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23);   // 5.5826318e-2f
 293     const __m128i D2   = _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4);   // 2.4015361e-1f
 294     const __m128i D1   = _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B);   // 6.9315308e-1f
 295     const __m128i D0   = _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF);   // 9.9999994e-1f
 296
 297         __m128 xmm0,xmm1;
 298         __m128i xmm2;
 299
 300         xmm1 = _mm_max_ps(x,gmx_mm_castsi128_ps(lim)); /* x<-87 gives exp(-87) */
 301         xmm0 = _mm_mul_ps(xmm1,gmx_mm_castsi128_ps(CC));
 302         xmm1 = _mm_sub_ps(xmm0,gmx_mm_castsi128_ps(half));
 303         xmm2 = _mm_cvtps_epi32(xmm1);
 304         xmm1 = _mm_cvtepi32_ps(xmm2);
 305
 306         xmm2 = _mm_add_epi32(xmm2,gmx_mm_castps_si128(base));
 307         xmm2 = _mm_slli_epi32(xmm2,23);
 308
 309         xmm0 = _mm_sub_ps(xmm0,xmm1);
 310         xmm1 = _mm_mul_ps(xmm0,gmx_mm_castsi128_ps(D5));
 311         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D4));
 312         xmm1 = _mm_mul_ps(xmm1,xmm0);
 313         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D3));
 314         xmm1 = _mm_mul_ps(xmm1,xmm0);
 315         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D2));
 316         xmm1 = _mm_mul_ps(xmm1,xmm0);
 317         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D1));
 318         xmm1 = _mm_mul_ps(xmm1,xmm0);
 319         xmm1 = _mm_add_ps(xmm1,gmx_mm_castsi128_ps(D0));
 320         xmm1 = _mm_mul_ps(xmm1,gmx_mm_castsi128_ps(xmm2));
 321
 322         /* 19 instructions currently + pipeline latenct after max_ps */
 323         return xmm1;
 324 }
 325
 326
 327 #define GMX_MM_SINCOS_PS(x,sinval,cosval)                                                                    \
 328 {                                                                                                            \
 329         const __m128 _sincosf_two_over_pi = {2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI};                               \
 330     const __m128 _sincosf_half        = {0.5,0.5,0.5,0.5};                                                   \
 331     const __m128 _sincosf_one         = {1.0,1.0,1.0,1.0};                                                   \
 332                                                                                                              \
 333         const __m128i _sincosf_izero      = _mm_set1_epi32(0);                                                   \
 334     const __m128i _sincosf_ione       = _mm_set1_epi32(1);                                                   \
 335     const __m128i _sincosf_itwo       = _mm_set1_epi32(2);                                                   \
 336     const __m128i _sincosf_ithree     = _mm_set1_epi32(3);                                                   \
 337                                                                                                              \
 338         const __m128 _sincosf_kc1 = {1.57079625129,1.57079625129,1.57079625129,1.57079625129};                   \
 339     const __m128 _sincosf_kc2 = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8};       \
 340         const __m128 _sincosf_cc0 = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249};                   \
 341     const __m128 _sincosf_cc1 = {0.0416566950,0.0416566950,0.0416566950,0.0416566950};                       \
 342     const __m128 _sincosf_cc2 = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225};                   \
 343         const __m128 _sincosf_sc0 = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727};                   \
 344     const __m128 _sincosf_sc1 = {0.0083320758,0.0083320758,0.0083320758,0.0083320758};                       \
 345     const __m128 _sincosf_sc2 = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247};                   \
 346                                                                                                              \
 347         __m128 _sincosf_signbit           = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );                   \
 348     __m128 _sincosf_tiny              = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) );                   \
 349                                                                                                              \
 350         __m128 _sincosf_xl;                                                                                      \
 351     __m128 _sincosf_xl2;                                                                                     \
 352     __m128 _sincosf_xl3;                                                                                     \
 353     __m128 _sincosf_qf;                                                                                      \
 354     __m128 _sincosf_absxl;                                                                                   \
 355     __m128 _sincosf_p1;                                                                                      \
 356     __m128 _sincosf_cx;                                                                                      \
 357     __m128 _sincosf_sx;                                                                                      \
 358     __m128 _sincosf_ts;                                                                                      \
 359     __m128 _sincosf_tc;                                                                                      \
 360     __m128 _sincosf_tsn;                                                                                     \
 361     __m128 _sincosf_tcn;                                                                                     \
 362         __m128i _sincosf_q;                                                                                      \
 363     __m128i _sincosf_offsetSin;                                                                              \
 364     __m128i _sincosf_offsetCos;                                                                              \
 365     __m128 _sincosf_sinMask;                                                                                 \
 366     __m128 _sincosf_cosMask;                                                                                 \
 367     __m128 _sincosf_isTiny;                                                                                  \
 368     __m128 _sincosf_ct0;                                                                                     \
 369     __m128 _sincosf_ct1;                                                                                     \
 370     __m128 _sincosf_ct2;                                                                                     \
 371     __m128 _sincosf_st1;                                                                                     \
 372     __m128 _sincosf_st2;                                                                                     \
 373                                                                                                              \
 374     _sincosf_xl        = _mm_mul_ps(x,_sincosf_two_over_pi);                                                 \
 375                                                                                                              \
 376     _sincosf_xl        = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
 377                                                                                                                  \
 378     _sincosf_q         = _mm_cvttps_epi32(_sincosf_xl);                                                      \
 379     _sincosf_qf        = _mm_cvtepi32_ps(_sincosf_q);                                                        \
 380                                                                                                                  \
 381     _sincosf_offsetSin   = _mm_and_si128(_sincosf_q,_sincosf_ithree);                                        \
 382     _sincosf_offsetCos   = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione);                                  \
 383                                                                                                              \
 384     _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1);                                                      \
 385     _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2);                                                      \
 386     _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1);                                                                 \
 387     _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl);                                                       \
 388                                                                                                              \
 389     _sincosf_absxl  = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl);                                           \
 390     _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl);                                            \
 391                                                                                                              \
 392     _sincosf_xl2    = _mm_mul_ps(_sincosf_xl,_sincosf_xl);                                                   \
 393     _sincosf_xl3    = _mm_mul_ps(_sincosf_xl2,_sincosf_xl);                                                  \
 394                                                                                                                  \
 395         _sincosf_ct1    = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2);                                                 \
 396         _sincosf_ct1    = _mm_add_ps(_sincosf_ct1,_sincosf_cc1);                                                 \
 397         _sincosf_st1    = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2);                                                 \
 398         _sincosf_st1    = _mm_add_ps(_sincosf_st1,_sincosf_sc1);                                                 \
 399         _sincosf_ct2    = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2);                                                 \
 400         _sincosf_ct2    = _mm_add_ps(_sincosf_ct2,_sincosf_cc2);                                                 \
 401         _sincosf_st2    = _mm_mul_ps(_sincosf_st1,_sincosf_xl2);                                                 \
 402         _sincosf_st2    = _mm_add_ps(_sincosf_st2,_sincosf_sc2);                                                 \
 403                                                                                                                  \
 404         _sincosf_cx     = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2);                                                 \
 405     _sincosf_cx     = _mm_add_ps(_sincosf_cx,_sincosf_one);                                                  \
 406                                                                                                              \
 407     _sincosf_sx     = _mm_mul_ps(_sincosf_st2,_sincosf_xl3);                                                 \
 408     _sincosf_sx     = _mm_add_ps(_sincosf_sx,_sincosf_xl);                                                   \
 409                                                                                                              \
 410     _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
 411     _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
 412                                                                                                              \
 413     _sincosf_ts     = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
 414     _sincosf_tc     = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
 415                                                                                                                  \
 416     _sincosf_sinMask = gmx_mm_castsi128_ps(  _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) );\
 417     _sincosf_tsn    = _mm_xor_ps(_sincosf_signbit,_sincosf_ts);                                              \
 418     _sincosf_ts     = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
 419                                                                                                                  \
 420     _sincosf_cosMask = gmx_mm_castsi128_ps(  _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
 421     _sincosf_tcn    = _mm_xor_ps(_sincosf_signbit,_sincosf_tc);                                              \
 422     _sincosf_tc     = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
 423                                                                                                                  \
 424     sinval = _sincosf_ts;                                                                                    \
 425     cosval = _sincosf_tc;                                                                                    \
 426 }
 427
 428
 429
 430 /* Load a single value from 1-4 places, merge into xmm register */
 431
 432 #define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
 433 {                                                         \
 434     __m128 _txmm2,_txmm3,_txmm4;                          \
 435     xmm1           = _mm_load_ss(ptr1);                   \
 436     _txmm2         = _mm_load_ss(ptr2);                   \
 437     _txmm3         = _mm_load_ss(ptr3);                   \
 438     _txmm4         = _mm_load_ss(ptr4);                   \
 439     xmm1           = _mm_unpacklo_ps(xmm1,_txmm3);        \
 440     _txmm2         = _mm_unpacklo_ps(_txmm2,_txmm4);      \
 441     xmm1           = _mm_unpacklo_ps(xmm1,_txmm2);        \
 442 }
 443
 444
 445 #define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
 446 {                                                    \
 447     __m128 _txmm2,_txmm3;                            \
 448     xmm1           = _mm_load_ss(ptr1);              \
 449     _txmm2         = _mm_load_ss(ptr2);              \
 450     _txmm3         = _mm_load_ss(ptr3);              \
 451     xmm1           = _mm_unpacklo_ps(xmm1,_txmm3);   \
 452     xmm1           = _mm_unpacklo_ps(xmm1,_txmm2);   \
 453 }
 454
 455
 456 #define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1)    \
 457 {                                                  \
 458     __m128 _txmm2;                                 \
 459     xmm1           = _mm_load_ss(ptr1);            \
 460     _txmm2         = _mm_load_ss(ptr2);            \
 461     xmm1           = _mm_unpacklo_ps(xmm1,_txmm2); \
 462 }
 463
 464
 465 #define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) \
 466 {                                         \
 467       xmm1           = _mm_load_ss(ptr1); \
 468 }
 469
 470 /* Store data in an xmm register into 1-4 different places */
 471 #define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1)             \
 472 {                                                                      \
 473     __m128 _txmm2,_txmm3,_txmm4;                                       \
 474     _txmm3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               \
 475     _txmm2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     \
 476     _txmm4       = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
 477     _mm_store_ss(ptr1,xmm1);                                           \
 478     _mm_store_ss(ptr2,_txmm2);                                         \
 479     _mm_store_ss(ptr3,_txmm3);                                         \
 480     _mm_store_ss(ptr4,_txmm4);                                         \
 481 }
 482
 483
 484 #define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)              \
 485 {                                                                  \
 486     __m128 _txmm2,_txmm3;                                          \
 487     _txmm3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);           \
 488     _txmm2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
 489     _mm_store_ss(ptr1,xmm1);                                       \
 490     _mm_store_ss(ptr2,_txmm2);                                     \
 491     _mm_store_ss(ptr3,_txmm3);                                     \
 492 }
 493
 494
 495 #define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1)                   \
 496 {                                                                  \
 497     __m128 _txmm2;                                                 \
 498     _txmm2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
 499     _mm_store_ss(ptr1,xmm1);                                       \
 500     _mm_store_ss(ptr2,_txmm2);                                     \
 501 }
 502
 503
 504 #define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) \
 505 {                                          \
 506     _mm_store_ss(ptr1,xmm1);               \
 507 }
 508
 509
 510 /* Similar to store, but increments value in memory */
 511 #define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2)    \
 512 {                                                                  \
 513     __m128 _tincr1,_tincr2;                                        \
 514     GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);          \
 515     GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);          \
 516     _tincr1 = _mm_add_ps(_tincr1,xmm1);                            \
 517     _tincr2 = _mm_add_ps(_tincr2,xmm2);                            \
 518     GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);         \
 519     GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);         \
 520 }
 521
 522 #define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1)    \
 523 {                                                                 \
 524     __m128 _tincr;                                                \
 525     GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);          \
 526     _tincr = _mm_add_ps(_tincr,xmm1);                             \
 527     GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);         \
 528 }
 529
 530 #define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)         \
 531 {                                                                 \
 532     __m128 _tincr;                                                \
 533     GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);               \
 534     _tincr = _mm_add_ps(_tincr,xmm1);                             \
 535     GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);              \
 536 }
 537
 538 #define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1)         \
 539 {                                                            \
 540     __m128 _tincr;                                           \
 541     GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr);               \
 542     _tincr = _mm_add_ps(_tincr,xmm1);                        \
 543     GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr);              \
 544 }
 545
 546 #define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1)         \
 547 {                                                      \
 548     __m128 _tincr;                                     \
 549     GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr);               \
 550     _tincr = _mm_add_ss(_tincr,xmm1);                  \
 551     GMX_MM_STORE_1VALUE_PS(ptr1,_tincr);              \
 552 }
 553
 554
 555
 556 /* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */
 557 #define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12)    \
 558 {                                                             \
 559     __m128 _tmp1,_tmp2,_tmp3,_tmp4;                           \
 560     _tmp1  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));  \
 561     _tmp2  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2));  \
 562     _tmp3  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3));  \
 563     _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4));  \
 564     _tmp1  = _mm_unpacklo_ps(_tmp1,_tmp3);                    \
 565     _tmp2  = _mm_unpacklo_ps(_tmp2,_tmp4);                    \
 566     c6     = _mm_unpacklo_ps(_tmp1,_tmp2);                    \
 567     c12    = _mm_unpackhi_ps(_tmp1,_tmp2);                    \
 568 }
 569
 570 #define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12)        \
 571 {                                                            \
 572     __m128 _tmp1,_tmp2,_tmp3;                                \
 573     _tmp1  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
 574     _tmp2  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
 575     _tmp3  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
 576     _tmp1  = _mm_unpacklo_ps(_tmp1,_tmp3);                   \
 577     _tmp2  = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps());        \
 578     c6     = _mm_unpacklo_ps(_tmp1,_tmp2);                   \
 579     c12    = _mm_unpackhi_ps(_tmp1,_tmp2);                   \
 580 }
 581
 582
 583 #define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12)             \
 584 {                                                            \
 585     __m128 _tmp1,_tmp2;                                      \
 586     _tmp1  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
 587     _tmp2  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
 588     c6     = _mm_unpacklo_ps(_tmp1,_tmp2);                   \
 589     c12    = _mm_movehl_ps(c12,c6);                          \
 590 }
 591
 592 #define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12)                   \
 593 {                                                            \
 594     c6     = _mm_load_ss(ptr1);                              \
 595     c12    = _mm_load_ss(ptr1+1);                            \
 596 }
 597
 598
 599 /* Routines to load 1-4 rvecs from 1-4 places.
 600  * We mainly use these to load coordinates. The extra routines
 601  * are very efficient for the water-water loops, since we e.g.
 602  * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
 603  */
 604 #define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) {             \
 605          jx1            = _mm_load_ss(ptr1);                                \
 606      jy1            = _mm_load_ss((ptr1)+1);                            \
 607      jz1            = _mm_load_ss((ptr1)+2);                            \
 608 }
 609
 610 #define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) {      \
 611          jx1            = _mm_load_ss(ptr1);                                      \
 612      jy1            = _mm_load_ss((ptr1)+1);                                  \
 613      jz1            = _mm_load_ss((ptr1)+2);                                  \
 614          jx2            = _mm_load_ss((ptr1)+3);                                  \
 615      jy2            = _mm_load_ss((ptr1)+4);                                  \
 616      jz2            = _mm_load_ss((ptr1)+5);                                  \
 617 }
 618
 619
 620 #define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 621          jx1            = _mm_load_ss(ptr1);                                    \
 622      jy1            = _mm_load_ss((ptr1)+1);                                \
 623      jz1            = _mm_load_ss((ptr1)+2);                                \
 624          jx2            = _mm_load_ss((ptr1)+3);                                \
 625      jy2            = _mm_load_ss((ptr1)+4);                                \
 626      jz2            = _mm_load_ss((ptr1)+5);                                \
 627          jx3            = _mm_load_ss((ptr1)+6);                                \
 628      jy3            = _mm_load_ss((ptr1)+7);                                \
 629      jz3            = _mm_load_ss((ptr1)+8);                                \
 630 }
 631
 632
 633 #define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 634          jx1            = _mm_load_ss(ptr1);                                    \
 635      jy1            = _mm_load_ss((ptr1)+1);                                \
 636      jz1            = _mm_load_ss((ptr1)+2);                                \
 637          jx2            = _mm_load_ss((ptr1)+3);                                \
 638      jy2            = _mm_load_ss((ptr1)+4);                                \
 639      jz2            = _mm_load_ss((ptr1)+5);                                \
 640          jx3            = _mm_load_ss((ptr1)+6);                                \
 641      jy3            = _mm_load_ss((ptr1)+7);                                \
 642      jz3            = _mm_load_ss((ptr1)+8);                                \
 643          jx4            = _mm_load_ss((ptr1)+9);                                \
 644      jy4            = _mm_load_ss((ptr1)+10);                               \
 645      jz4            = _mm_load_ss((ptr1)+11);                               \
 646 }
 647
 648
 649 #define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {  \
 650       __m128 _tmp1,_tmp2;                                           \
 651       _tmp1           = _mm_load_ss(ptr1);                          \
 652           _tmp2           = _mm_load_ss(ptr2);                          \
 653       _tmp1           = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));      \
 654       _tmp2           = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));      \
 655       jx1             = _mm_unpacklo_ps(_tmp1,_tmp2);               \
 656       jy1             = _mm_unpackhi_ps(_tmp1,_tmp2);               \
 657           jx1             = _mm_unpacklo_ps(_tmp1,_tmp2);               \
 658       jz1             = _mm_movehl_ps(jz1,jy1);                     \
 659 }
 660
 661 #define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
 662      __m128 _tmp1, _tmp2;                                                      \
 663          _tmp1          = _mm_loadu_ps(ptr1);                                      \
 664      jy1            = _mm_loadu_ps(ptr2);                                      \
 665      jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));        \
 666      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));        \
 667      jx1            = _mm_unpacklo_ps(_tmp1,jy1);                              \
 668      jz1            = _mm_unpackhi_ps(_tmp1,jy1);                              \
 669      jy2            = _mm_unpacklo_ps(jy2,_tmp2);                              \
 670      jy1            = _mm_movehl_ps(jx1,jx1);                                  \
 671      jx2            = _mm_movehl_ps(jz1,jz1);                                  \
 672      jz2            = _mm_movehl_ps(jy2,jy2);                                  \
 673 }
 674
 675
 676 #define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 677      __m128 _tmp1, _tmp2, _tmp3;                                                           \
 678          _tmp1          = _mm_loadu_ps(ptr1);                                                  \
 679      jy1            = _mm_loadu_ps(ptr2);                                                  \
 680      _tmp2          = _mm_loadu_ps(ptr1+4);                                                \
 681      jz2            = _mm_loadu_ps(ptr2+4);                                                \
 682      jz3            = _mm_load_ss(ptr1+8);                                                 \
 683      _tmp3          = _mm_load_ss(ptr2+8);                                                 \
 684      jx1            = _mm_unpacklo_ps(_tmp1,jy1);                                          \
 685      jz1            = _mm_unpackhi_ps(_tmp1,jy1);                                          \
 686      jy2            = _mm_unpacklo_ps(_tmp2,jz2);                                          \
 687      jx3            = _mm_unpackhi_ps(_tmp2,jz2);                                          \
 688      jy1            = _mm_movehl_ps(jx1,jx1);                                              \
 689      jx2            = _mm_movehl_ps(jz1,jz1);                                              \
 690      jz2            = _mm_movehl_ps(jy2,jy2);                                              \
 691      jy3            = _mm_movehl_ps(jx3,jx3);                                              \
 692      jz3            = _mm_unpacklo_ps(jz3,_tmp3);                                          \
 693 }
 694
 695
 696 #define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 697      __m128 _tmp1, _tmp2, _tmp3,_tmp4;                                                                 \
 698          _tmp1          = _mm_loadu_ps(ptr1);                                                              \
 699      jy1            = _mm_loadu_ps(ptr2);                                                              \
 700      _tmp2          = _mm_loadu_ps(ptr1+4);                                                            \
 701      jz2            = _mm_loadu_ps(ptr2+4);                                                            \
 702      _tmp3          = _mm_loadu_ps(ptr1+8);                                                            \
 703      _tmp4          = _mm_loadu_ps(ptr2+8);                                                            \
 704      jx1            = _mm_unpacklo_ps(_tmp1,jy1);                                                      \
 705      jz1            = _mm_unpackhi_ps(_tmp1,jy1);                                                      \
 706      jy2            = _mm_unpacklo_ps(_tmp2,jz2);                                                      \
 707      jx3            = _mm_unpackhi_ps(_tmp2,jz2);                                                      \
 708      jz3            = _mm_unpacklo_ps(_tmp3,_tmp4);                                                    \
 709      jy4            = _mm_unpackhi_ps(_tmp3,_tmp4);                                                    \
 710      jy1            = _mm_movehl_ps(jx1,jx1);                                                          \
 711      jx2            = _mm_movehl_ps(jz1,jz1);                                                          \
 712      jz2            = _mm_movehl_ps(jy2,jy2);                                                          \
 713      jy3            = _mm_movehl_ps(jx3,jx3);                                                          \
 714      jx4            = _mm_movehl_ps(jz3,jz3);                                                          \
 715      jz4            = _mm_movehl_ps(jy4,jy4);                                                          \
 716 }
 717
 718
 719 #define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
 720      __m128 _tmp1,_tmp3,_tmp4;                                         \
 721          jx1            = _mm_load_ss(ptr1);                               \
 722      jy1            = _mm_load_ss(ptr2);                               \
 723      jz1            = _mm_load_ss(ptr3);                               \
 724          jx1            = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1));             \
 725      jy1            = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1));             \
 726      jz1            = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1));             \
 727      _tmp1          = _mm_unpacklo_ps(jx1,jy1);                        \
 728      _tmp3          = _mm_unpackhi_ps(jx1,jy1);                        \
 729      _tmp4          = _mm_unpackhi_ps(jz1,jz1);                        \
 730      jx1            = _mm_movelh_ps(_tmp1,jz1);                        \
 731      jy1            = _mm_movelh_ps(_tmp3,_tmp4);                      \
 732      jz1            = _mm_movehl_ps(_tmp4,_tmp3);                      \
 733 }
 734
 735
 736 #define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
 737      __m128 _tmp1, _tmp2;                                                           \
 738          jx1            = _mm_loadu_ps(ptr1);                                           \
 739      jy1            = _mm_loadu_ps(ptr2);                                           \
 740      jz1            = _mm_loadu_ps(ptr3);                                           \
 741      jx2            = _mm_setzero_ps();                                             \
 742      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                            \
 743      _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));             \
 744      jz2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));             \
 745      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));             \
 746      _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp2);                                 \
 747      jz2            = _mm_unpacklo_ps(jz2,_mm_setzero_ps());                        \
 748      jy2            = _mm_unpacklo_ps(_tmp1,jz2);                                   \
 749      jz2            = _mm_unpackhi_ps(_tmp1,jz2);                                   \
 750 }
 751
 752
 753 #define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 754      __m128 _tmp1, _tmp2;                                                                       \
 755          jx1            = _mm_loadu_ps(ptr1);                                                       \
 756      jy1            = _mm_loadu_ps(ptr2);                                                       \
 757      jz1            = _mm_loadu_ps(ptr3);                                                       \
 758      jx2            = _mm_setzero_ps();                                                         \
 759      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                        \
 760      jy2            = _mm_loadu_ps(ptr1+4);                                                     \
 761      jz2            = _mm_loadu_ps(ptr2+4);                                                     \
 762      jx3            = _mm_loadu_ps(ptr3+4);                                                     \
 763      jy3            = _mm_setzero_ps();                                                         \
 764      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                        \
 765      jz3            = _mm_load_ss(ptr1+8);                                                      \
 766      _tmp1          = _mm_load_ss(ptr2+8);                                                      \
 767      _tmp2          = _mm_load_ss(ptr3+8);                                                      \
 768      jz3            = _mm_unpacklo_ps(jz3,_tmp2);                                               \
 769      _tmp1          = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps());                                  \
 770      jz3            = _mm_unpacklo_ps(jz3,_tmp1);                                               \
 771 }
 772
 773
 774 #define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 775          jx1            = _mm_loadu_ps(ptr1);                                                                   \
 776      jy1            = _mm_loadu_ps(ptr2);                                                                   \
 777      jz1            = _mm_loadu_ps(ptr3);                                                                   \
 778      jx2            = _mm_setzero_ps();                                                                     \
 779      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                                    \
 780      jy2            = _mm_loadu_ps(ptr1+4);                                                                 \
 781      jz2            = _mm_loadu_ps(ptr2+4);                                                                 \
 782      jx3            = _mm_loadu_ps(ptr3+4);                                                                 \
 783      jy3            = _mm_setzero_ps();                                                                     \
 784      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                                    \
 785      jz3            = _mm_loadu_ps(ptr1+8);                                                                 \
 786      jx4            = _mm_loadu_ps(ptr2+8);                                                                 \
 787      jy4            = _mm_loadu_ps(ptr3+8);                                                                 \
 788      jz4            = _mm_setzero_ps();                                                                     \
 789      _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);                                                                    \
 790 }
 791
 792
 793
 794 #define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) {  \
 795      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                   \
 796          jx1            = _mm_load_ss(ptr1);                                     \
 797      _tmp1          = _mm_load_ss(ptr2);                                     \
 798      jy1            = _mm_load_ss(ptr3);                                     \
 799      jz1            = _mm_load_ss(ptr4);                                     \
 800          jx1            = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1));                   \
 801      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1));                 \
 802      jy1            = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1));                   \
 803      jz1            = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1));                   \
 804      _tmp2          = _mm_unpacklo_ps(jx1,_tmp1);                            \
 805      _tmp3          = _mm_unpacklo_ps(jy1,jz1);                              \
 806      _tmp4          = _mm_unpackhi_ps(jx1,_tmp1);                            \
 807      _tmp5          = _mm_unpackhi_ps(jy1,jz1);                              \
 808      jx1            = _mm_movelh_ps(_tmp2,_tmp3);                            \
 809      jy1            = _mm_movelh_ps(_tmp4,_tmp5);                            \
 810      jz1            = _mm_movehl_ps(_tmp5,_tmp4);                            \
 811 }
 812
 813
 814 #define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
 815      __m128 _tmp1, _tmp2;                                                                \
 816          jx1            = _mm_loadu_ps(ptr1);                                                \
 817      jy1            = _mm_loadu_ps(ptr2);                                                \
 818      jz1            = _mm_loadu_ps(ptr3);                                                \
 819      jx2            = _mm_loadu_ps(ptr4);                                                \
 820      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                 \
 821      jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));                  \
 822      jz2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));                  \
 823      _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));                  \
 824      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4));                  \
 825      _tmp1          = _mm_unpacklo_ps(jy2,_tmp1);                                        \
 826      _tmp2          = _mm_unpacklo_ps(jz2,_tmp2);                                        \
 827      jy2            = _mm_unpacklo_ps(_tmp1,_tmp2);                                      \
 828      jz2            = _mm_unpackhi_ps(_tmp1,_tmp2);                                      \
 829 }
 830
 831
 832 #define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 833      __m128 _tmp1, _tmp2, _tmp3;                                                                     \
 834          jx1            = _mm_loadu_ps(ptr1);                                                            \
 835      jy1            = _mm_loadu_ps(ptr2);                                                            \
 836      jz1            = _mm_loadu_ps(ptr3);                                                            \
 837      jx2            = _mm_loadu_ps(ptr4);                                                            \
 838      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                             \
 839      jy2            = _mm_loadu_ps(ptr1+4);                                                          \
 840      jz2            = _mm_loadu_ps(ptr2+4);                                                          \
 841      jx3            = _mm_loadu_ps(ptr3+4);                                                          \
 842      jy3            = _mm_loadu_ps(ptr4+4);                                                          \
 843      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                             \
 844      jz3            = _mm_load_ss(ptr1+8);                                                           \
 845      _tmp1          = _mm_load_ss(ptr2+8);                                                           \
 846      _tmp2          = _mm_load_ss(ptr3+8);                                                           \
 847      _tmp3          = _mm_load_ss(ptr4+8);                                                           \
 848      jz3            = _mm_unpacklo_ps(jz3,_tmp2);                                                    \
 849      _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp3);                                                  \
 850      jz3            = _mm_unpacklo_ps(jz3,_tmp1);                                                    \
 851 }
 852
 853
 854 #define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 855          jx1            = _mm_loadu_ps(ptr1);                                                                        \
 856      jy1            = _mm_loadu_ps(ptr2);                                                                        \
 857      jz1            = _mm_loadu_ps(ptr3);                                                                        \
 858      jx2            = _mm_loadu_ps(ptr4);                                                                        \
 859      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                                         \
 860      jy2            = _mm_loadu_ps(ptr1+4);                                                                      \
 861      jz2            = _mm_loadu_ps(ptr2+4);                                                                      \
 862      jx3            = _mm_loadu_ps(ptr3+4);                                                                      \
 863      jy3            = _mm_loadu_ps(ptr4+4);                                                                      \
 864      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                                         \
 865      jz3            = _mm_loadu_ps(ptr1+8);                                                                      \
 866      jx4            = _mm_loadu_ps(ptr2+8);                                                                      \
 867      jy4            = _mm_loadu_ps(ptr3+8);                                                                      \
 868      jz4            = _mm_loadu_ps(ptr4+8);                                                                      \
 869      _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);                                                                         \
 870 }
 871
 872
 873 /* Routines to increment rvecs in memory, typically use for j particle force updates */
 874 #define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) {      \
 875      __m128 _tmp1;                                                    \
 876      jy1            = _mm_unpacklo_ps(jy1,jz1);                       \
 877      jx1            = _mm_movelh_ps(jx1,jy1);                         \
 878      _tmp1          = _mm_load_ss(ptr1);                              \
 879      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
 880      _tmp1          = _mm_add_ps(_tmp1,jx1);                          \
 881      _mm_store_ss(ptr1,_tmp1);                                        \
 882      _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
 883 }
 884
 885
 886 #define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
 887      __m128 _tmp1, _tmp2;                                                     \
 888      _tmp1          = _mm_loadu_ps(ptr1);                                     \
 889      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));       \
 890      jx1            = _mm_unpacklo_ps(jx1,jy1);                               \
 891      jz1            = _mm_unpacklo_ps(jz1,jx2);                               \
 892      jy2            = _mm_unpacklo_ps(jy2,jz2);                               \
 893      jx1            = _mm_movelh_ps(jx1,jz1);                                 \
 894      _tmp1          = _mm_add_ps(_tmp1,jx1);                                  \
 895      _tmp2          = _mm_add_ps(_tmp2,jy2);                                  \
 896      _mm_storeu_ps(ptr1,_tmp1);                                               \
 897      _mm_storel_pi((__m64 *)(ptr1+4),_tmp2);                                  \
 898 }
 899
 900
 901 #define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 902      __m128 _tmp1, _tmp2, _tmp3;                                                          \
 903      _tmp1          = _mm_loadu_ps(ptr1);                                                 \
 904      _tmp2          = _mm_loadu_ps(ptr1+4);                                               \
 905      _tmp3          = _mm_load_ss(ptr1+8);                                                \
 906      jx1            = _mm_unpacklo_ps(jx1,jy1);                                           \
 907      jz1            = _mm_unpacklo_ps(jz1,jx2);                                           \
 908      jy2            = _mm_unpacklo_ps(jy2,jz2);                                           \
 909      jx3            = _mm_unpacklo_ps(jx3,jy3);                                           \
 910      jx1            = _mm_movelh_ps(jx1,jz1);                                             \
 911      jy2            = _mm_movelh_ps(jy2,jx3);                                             \
 912      _tmp1           = _mm_add_ps(_tmp1,jx1);                                             \
 913      _tmp2           = _mm_add_ps(_tmp2,jy2);                                             \
 914      _tmp3           = _mm_add_ss(_tmp3,jz3);                                             \
 915      _mm_storeu_ps(ptr1,_tmp1);                                                           \
 916      _mm_storeu_ps(ptr1+4,_tmp2);                                                         \
 917      _mm_store_ss(ptr1+8,_tmp3);                                                          \
 918 }
 919
 920
 921 #define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 922      __m128 _tmp1, _tmp2, _tmp3;                                                                      \
 923      _tmp1          = _mm_loadu_ps(ptr1);                                                             \
 924      _tmp2          = _mm_loadu_ps(ptr1+4);                                                           \
 925      _tmp3          = _mm_loadu_ps(ptr1+8);                                                           \
 926      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                       \
 927      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                       \
 928      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                       \
 929      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                       \
 930      jz3            = _mm_unpacklo_ps(jz3,jx4);                                                       \
 931      jy4            = _mm_unpacklo_ps(jy4,jz4);                                                       \
 932      jx1            = _mm_movelh_ps(jx1,jz1);                                                         \
 933      jy2            = _mm_movelh_ps(jy2,jx3);                                                         \
 934      jz3            = _mm_movelh_ps(jz3,jy4);                                                         \
 935      _tmp1          = _mm_add_ps(_tmp1,jx1);                                                          \
 936      _tmp2          = _mm_add_ps(_tmp2,jy2);                                                          \
 937      _tmp3          = _mm_add_ps(_tmp3,jz3);                                                          \
 938      _mm_storeu_ps(ptr1,_tmp1);                                                                       \
 939      _mm_storeu_ps(ptr1+4,_tmp2);                                                                     \
 940      _mm_storeu_ps(ptr1+8,_tmp3);                                                                     \
 941 }
 942
 943
 944 #define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {        \
 945      __m128 _tmp1,_tmp2,_tmp3,_tmp4;                                          \
 946      _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));         \
 947      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2));                    \
 948      _tmp2          = _mm_load_ss(ptr1+2);                                    \
 949      _tmp3          = _mm_load_ss(ptr2+2);                                    \
 950      jx1            = _mm_unpacklo_ps(jx1,jy1);                               \
 951      _tmp4          = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1));           \
 952      _tmp1          = _mm_add_ps(_tmp1,jx1);                                  \
 953      _mm_storel_pi((__m64 *)(ptr1),_tmp1);                                    \
 954      _mm_storeh_pi((__m64 *)(ptr2),_tmp1);                                    \
 955      _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1));                              \
 956          _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4));                            \
 957 }
 958
 959
 960 #define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) {  \
 961      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                           \
 962      _tmp1          = _mm_loadu_ps(ptr1);                                            \
 963      _tmp2          = _mm_loadu_ps(ptr2);                                            \
 964      _tmp3          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));              \
 965      _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4));                         \
 966      jx1            = _mm_unpacklo_ps(jx1,jy1);                                      \
 967      jz1            = _mm_unpacklo_ps(jz1,jx2);                                      \
 968      jy2            = _mm_unpacklo_ps(jy2,jz2);                                      \
 969      _tmp4          = _mm_movelh_ps(jx1,jz1);                                        \
 970      _tmp5          = _mm_movehl_ps(jz1,jx1);                                        \
 971      _tmp1          = _mm_add_ps(_tmp1,_tmp4);                                       \
 972      _tmp2          = _mm_add_ps(_tmp2,_tmp5);                                       \
 973      _tmp3          = _mm_add_ps(_tmp3,jy2);                                         \
 974      _mm_storeu_ps(ptr1,_tmp1);                                                      \
 975      _mm_storeu_ps(ptr2,_tmp2);                                                      \
 976      _mm_storel_pi((__m64 *)(ptr1+4),_tmp3);                                         \
 977          _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3);                                         \
 978 }
 979
 980
 981 #define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 982      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                \
 983      _tmp1          = _mm_loadu_ps(ptr1);                                                       \
 984      _tmp2          = _mm_loadu_ps(ptr1+4);                                                     \
 985      _tmp3          = _mm_load_ss(ptr1+8);                                                      \
 986      _tmp4          = _mm_loadu_ps(ptr2);                                                       \
 987      _tmp5          = _mm_loadu_ps(ptr2+4);                                                     \
 988      _tmp6          = _mm_load_ss(ptr2+8);                                                      \
 989      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                 \
 990      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                 \
 991      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                 \
 992      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                 \
 993      _tmp7          = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                             \
 994      _tmp8          = _mm_movelh_ps(jx1,jz1);                                                   \
 995      _tmp9          = _mm_movehl_ps(jz1,jx1);                                                   \
 996      _tmp10         = _mm_movelh_ps(jy2,jx3);                                                   \
 997      _tmp11         = _mm_movehl_ps(jx3,jy2);                                                   \
 998      _tmp1          = _mm_add_ps(_tmp1,_tmp8);                                                  \
 999      _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                                 \
1000      _tmp3          = _mm_add_ss(_tmp3,jz3);                                                    \
1001      _tmp4          = _mm_add_ps(_tmp4,_tmp9);                                                  \
1002      _tmp5          = _mm_add_ps(_tmp5,_tmp11);                                                 \
1003      _tmp6          = _mm_add_ss(_tmp6,_tmp7);                                                  \
1004      _mm_storeu_ps(ptr1,_tmp1);                                                                 \
1005      _mm_storeu_ps(ptr1+4,_tmp2);                                                               \
1006      _mm_store_ss(ptr1+8,_tmp3);                                                                \
1007      _mm_storeu_ps(ptr2,_tmp4);                                                                 \
1008      _mm_storeu_ps(ptr2+4,_tmp5);                                                               \
1009      _mm_store_ss(ptr2+8,_tmp6);                                                                \
1010 }
1011
1012
1013 #define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1014      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13;              \
1015      _tmp1          = _mm_loadu_ps(ptr1);                                                                   \
1016      _tmp2          = _mm_loadu_ps(ptr1+4);                                                                 \
1017      _tmp3          = _mm_loadu_ps(ptr1+8);                                                                 \
1018      _tmp4          = _mm_loadu_ps(ptr2);                                                                   \
1019      _tmp5          = _mm_loadu_ps(ptr2+4);                                                                 \
1020      _tmp6          = _mm_loadu_ps(ptr2+8);                                                                 \
1021      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                             \
1022      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                             \
1023      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                             \
1024      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                             \
1025      jz3            = _mm_unpacklo_ps(jz3,jx4);                                                             \
1026      jy4            = _mm_unpacklo_ps(jy4,jz4);                                                             \
1027      _tmp8          = _mm_movelh_ps(jx1,jz1);                                                               \
1028      _tmp9          = _mm_movehl_ps(jz1,jx1);                                                               \
1029      _tmp10         = _mm_movelh_ps(jy2,jx3);                                                               \
1030      _tmp11         = _mm_movehl_ps(jx3,jy2);                                                               \
1031      _tmp12         = _mm_movelh_ps(jz3,jy4);                                                               \
1032      _tmp13         = _mm_movehl_ps(jy4,jz3);                                                               \
1033      _tmp1          = _mm_add_ps(_tmp1,_tmp8);                                                              \
1034      _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                                             \
1035      _tmp3          = _mm_add_ps(_tmp3,_tmp12);                                                             \
1036      _tmp4          = _mm_add_ps(_tmp4,_tmp9);                                                              \
1037      _tmp5          = _mm_add_ps(_tmp5,_tmp11);                                                             \
1038      _tmp6          = _mm_add_ps(_tmp6,_tmp13);                                                             \
1039      _mm_storeu_ps(ptr1,_tmp1);                                                                             \
1040      _mm_storeu_ps(ptr1+4,_tmp2);                                                                           \
1041      _mm_storeu_ps(ptr1+8,_tmp3);                                                                           \
1042      _mm_storeu_ps(ptr2,_tmp4);                                                                             \
1043      _mm_storeu_ps(ptr2+4,_tmp5);                                                                           \
1044      _mm_storeu_ps(ptr2+8,_tmp6);                                                                           \
1045 }
1046
1047
1048 #define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) {   \
1049      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7;                        \
1050      _tmp1          = _mm_load_ss(ptr1);                                      \
1051      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));                  \
1052      _tmp2          = _mm_load_ss(ptr2);                                      \
1053      _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));                  \
1054      _tmp3          = _mm_load_ss(ptr3);                                      \
1055      _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));                  \
1056      _tmp4          = _mm_unpacklo_ps(jy1,jz1);                               \
1057      _tmp5          = _mm_unpackhi_ps(jy1,jz1);                               \
1058      _tmp6          = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1));         \
1059      _tmp7          = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));           \
1060      jx1            = _mm_movelh_ps(jx1,_tmp4);                               \
1061      _tmp7          = _mm_movelh_ps(_tmp7,_tmp5);                             \
1062      _tmp1          = _mm_add_ps(_tmp1,jx1);                                  \
1063      _tmp2          = _mm_add_ps(_tmp2,_tmp6);                                \
1064      _tmp3          = _mm_add_ps(_tmp3,_tmp7);                                \
1065      _mm_store_ss(ptr1,_tmp1);                                                \
1066      _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                                  \
1067      _mm_store_ss(ptr2,_tmp2);                                                \
1068      _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                                  \
1069      _mm_store_ss(ptr3,_tmp3);                                                \
1070      _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                                  \
1071 }
1072
1073
1074 #define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1075      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;                \
1076      _tmp1          = _mm_loadu_ps(ptr1);                                                \
1077      _tmp2          = _mm_loadu_ps(ptr2);                                                \
1078      _tmp3          = _mm_loadu_ps(ptr3);                                                \
1079      _tmp4          = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4));                             \
1080      _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4));                             \
1081      _tmp5          = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4));                             \
1082      _tmp6          = _mm_unpackhi_ps(jx1,jy1);                                          \
1083          jx1            = _mm_unpacklo_ps(jx1,jy1);                                          \
1084      _tmp7          = _mm_unpackhi_ps(jz1,jx2);                                          \
1085      jz1            = _mm_unpacklo_ps(jz1,jx2);                                          \
1086      _tmp8          = _mm_unpackhi_ps(jy2,jz2);                                          \
1087      jy2            = _mm_unpacklo_ps(jy2,jz2);                                          \
1088      _tmp9          = _mm_movelh_ps(jx1,jz1);                                            \
1089      _tmp10         = _mm_movehl_ps(jz1,jx1);                                            \
1090      _tmp6          = _mm_movelh_ps(_tmp6,_tmp7);                                        \
1091      _tmp1          = _mm_add_ps(_tmp1,_tmp9);                                           \
1092      _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                          \
1093      _tmp3          = _mm_add_ps(_tmp3,_tmp6);                                           \
1094      _tmp4          = _mm_add_ps(_tmp4,jy2);                                             \
1095      _tmp5          = _mm_add_ps(_tmp5,_tmp8);                                           \
1096      _mm_storeu_ps(ptr1,_tmp1);                                                          \
1097      _mm_storeu_ps(ptr2,_tmp2);                                                          \
1098      _mm_storeu_ps(ptr3,_tmp3);                                                          \
1099      _mm_storel_pi((__m64 *)(ptr1+4),_tmp4);                                             \
1100      _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4);                                             \
1101          _mm_storel_pi((__m64 *)(ptr3+4),_tmp5);                                             \
1102 }
1103
1104
1105 #define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1106      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;                            \
1107      __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;                          \
1108      _tmp1          = _mm_loadu_ps(ptr1);                                                            \
1109      _tmp2          = _mm_loadu_ps(ptr1+4);                                                          \
1110      _tmp3          = _mm_load_ss(ptr1+8);                                                           \
1111      _tmp4          = _mm_loadu_ps(ptr2);                                                            \
1112      _tmp5          = _mm_loadu_ps(ptr2+4);                                                          \
1113      _tmp6          = _mm_load_ss(ptr2+8);                                                           \
1114      _tmp7          = _mm_loadu_ps(ptr3);                                                            \
1115      _tmp8          = _mm_loadu_ps(ptr3+4);                                                          \
1116      _tmp9          = _mm_load_ss(ptr3+8);                                                           \
1117      _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                                      \
1118      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                      \
1119      _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                                      \
1120      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                      \
1121      _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                                      \
1122      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                      \
1123      _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                                      \
1124      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                      \
1125      _tmp14         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                                  \
1126      _tmp15         = _mm_movehl_ps(jz3,jz3);                                                        \
1127      _tmp16         = _mm_movelh_ps(jx1,jz1);                                                        \
1128      _tmp17         = _mm_movehl_ps(jz1,jx1);                                                        \
1129      _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                                                  \
1130      _tmp18         = _mm_movelh_ps(jy2,jx3);                                                        \
1131      _tmp19         = _mm_movehl_ps(jx3,jy2);                                                        \
1132      _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                                                  \
1133      _tmp1          = _mm_add_ps(_tmp1,_tmp16);                                                      \
1134      _tmp2          = _mm_add_ps(_tmp2,_tmp18);                                                      \
1135      _tmp3          = _mm_add_ss(_tmp3,jz3);                                                         \
1136      _tmp4          = _mm_add_ps(_tmp4,_tmp17);                                                      \
1137      _tmp5          = _mm_add_ps(_tmp5,_tmp19);                                                      \
1138      _tmp6          = _mm_add_ss(_tmp6,_tmp14);                                                      \
1139      _tmp7          = _mm_add_ps(_tmp7,_tmp10);                                                      \
1140      _tmp8          = _mm_add_ps(_tmp8,_tmp12);                                                      \
1141      _tmp9          = _mm_add_ss(_tmp9,_tmp15);                                                      \
1142      _mm_storeu_ps(ptr1,_tmp1);                                                                      \
1143      _mm_storeu_ps(ptr1+4,_tmp2);                                                                    \
1144      _mm_store_ss(ptr1+8,_tmp3);                                                                     \
1145      _mm_storeu_ps(ptr2,_tmp4);                                                                      \
1146      _mm_storeu_ps(ptr2+4,_tmp5);                                                                    \
1147      _mm_store_ss(ptr2+8,_tmp6);                                                                     \
1148      _mm_storeu_ps(ptr3,_tmp7);                                                                      \
1149      _mm_storeu_ps(ptr3+4,_tmp8);                                                                    \
1150      _mm_store_ss(ptr3+8,_tmp9);                                                                     \
1151 }
1152
1153
1154 #define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1155      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                                 \
1156      __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21;                               \
1157      _tmp1          = _mm_loadu_ps(ptr1);                                                                        \
1158      _tmp2          = _mm_loadu_ps(ptr1+4);                                                                      \
1159      _tmp3          = _mm_loadu_ps(ptr1+8);                                                                      \
1160      _tmp4          = _mm_loadu_ps(ptr2);                                                                        \
1161      _tmp5          = _mm_loadu_ps(ptr2+4);                                                                      \
1162      _tmp6          = _mm_loadu_ps(ptr2+8);                                                                      \
1163      _tmp7          = _mm_loadu_ps(ptr3);                                                                        \
1164      _tmp8          = _mm_loadu_ps(ptr3+4);                                                                      \
1165      _tmp9          = _mm_loadu_ps(ptr3+8);                                                                      \
1166      _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                                                  \
1167      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                                  \
1168      _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                                                  \
1169      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                                  \
1170      _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                                                  \
1171      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                                  \
1172      _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                                                  \
1173      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                                  \
1174      _tmp14         = _mm_unpackhi_ps(jz3,jx4);                                                                  \
1175      jz3            = _mm_unpacklo_ps(jz3,jx4);                                                                  \
1176      _tmp15         = _mm_unpackhi_ps(jy4,jz4);                                                                  \
1177      jy4            = _mm_unpacklo_ps(jy4,jz4);                                                                  \
1178      _tmp16         = _mm_movelh_ps(jx1,jz1);                                                                    \
1179      _tmp17         = _mm_movehl_ps(jz1,jx1);                                                                    \
1180      _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                                                              \
1181      _tmp18         = _mm_movelh_ps(jy2,jx3);                                                                    \
1182      _tmp19         = _mm_movehl_ps(jx3,jy2);                                                                    \
1183      _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                                                              \
1184      _tmp20         = _mm_movelh_ps(jz3,jy4);                                                                    \
1185      _tmp21         = _mm_movehl_ps(jy4,jz3);                                                                    \
1186      _tmp14         = _mm_movelh_ps(_tmp14,_tmp15);                                                              \
1187      _tmp1          = _mm_add_ps(_tmp1,_tmp16);                                                                  \
1188      _tmp2          = _mm_add_ps(_tmp2,_tmp18);                                                                  \
1189      _tmp3          = _mm_add_ps(_tmp3,_tmp20);                                                                  \
1190      _tmp4          = _mm_add_ps(_tmp4,_tmp17);                                                                  \
1191      _tmp5          = _mm_add_ps(_tmp5,_tmp19);                                                                  \
1192      _tmp6          = _mm_add_ps(_tmp6,_tmp21);                                                                  \
1193      _tmp7          = _mm_add_ps(_tmp7,_tmp10);                                                                  \
1194      _tmp8          = _mm_add_ps(_tmp8,_tmp12);                                                                  \
1195      _tmp9          = _mm_add_ps(_tmp9,_tmp14);                                                                  \
1196      _mm_storeu_ps(ptr1,_tmp1);                                                                                  \
1197      _mm_storeu_ps(ptr1+4,_tmp2);                                                                                \
1198      _mm_storeu_ps(ptr1+8,_tmp3);                                                                                \
1199      _mm_storeu_ps(ptr2,_tmp4);                                                                                  \
1200      _mm_storeu_ps(ptr2+4,_tmp5);                                                                                \
1201      _mm_storeu_ps(ptr2+8,_tmp6);                                                                                \
1202      _mm_storeu_ps(ptr3,_tmp7);                                                                                  \
1203      _mm_storeu_ps(ptr3+4,_tmp8);                                                                                \
1204      _mm_storeu_ps(ptr3+8,_tmp9);                                                                                \
1205 }
1206
1207
1208
1209 #define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1210      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;        \
1211      _tmp1          = _mm_load_ss(ptr1);                                         \
1212      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));                     \
1213      _tmp2          = _mm_load_ss(ptr2);                                         \
1214      _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));                     \
1215      _tmp3          = _mm_load_ss(ptr3);                                         \
1216      _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));                     \
1217      _tmp4          = _mm_load_ss(ptr4);                                         \
1218      _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1));                     \
1219      _tmp5          = _mm_unpacklo_ps(jy1,jz1);                                  \
1220      _tmp6          = _mm_unpackhi_ps(jy1,jz1);                                  \
1221      _tmp7          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0));            \
1222      _tmp8          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1));            \
1223      _tmp9          = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2));            \
1224      _tmp10         = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3));            \
1225      _tmp1          = _mm_add_ps(_tmp1,_tmp7);                                   \
1226      _tmp2          = _mm_add_ps(_tmp2,_tmp8);                                   \
1227      _tmp3          = _mm_add_ps(_tmp3,_tmp9);                                   \
1228      _tmp4          = _mm_add_ps(_tmp4,_tmp10);                                  \
1229      _mm_store_ss(ptr1,_tmp1);                                                   \
1230      _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                                     \
1231      _mm_store_ss(ptr2,_tmp2);                                                   \
1232      _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                                     \
1233      _mm_store_ss(ptr3,_tmp3);                                                   \
1234      _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                                     \
1235      _mm_store_ss(ptr4,_tmp4);                                                   \
1236      _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4);                                     \
1237 }
1238
1239
1240 #define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) {  \
1241      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1242      _tmp1          = _mm_loadu_ps(ptr1);                                                      \
1243      _tmp2          = _mm_loadu_ps(ptr2);                                                      \
1244      _tmp3          = _mm_loadu_ps(ptr3);                                                      \
1245      _tmp4          = _mm_loadu_ps(ptr4);                                                      \
1246      _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));                        \
1247      _tmp5          = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4));                                   \
1248      _tmp6          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));                        \
1249      _tmp6          = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4));                                   \
1250      _tmp7          = _mm_unpackhi_ps(jx1,jy1);                                                \
1251          jx1            = _mm_unpacklo_ps(jx1,jy1);                                                \
1252      _tmp8          = _mm_unpackhi_ps(jz1,jx2);                                                \
1253      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                \
1254      _tmp9          = _mm_unpackhi_ps(jy2,jz2);                                                \
1255      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                \
1256      _tmp10         = _mm_movelh_ps(jx1,jz1);                                                  \
1257      _tmp11         = _mm_movehl_ps(jz1,jx1);                                                  \
1258      _tmp12         = _mm_movelh_ps(_tmp7,_tmp8);                                              \
1259      _tmp13         = _mm_movehl_ps(_tmp8,_tmp7);                                              \
1260      _tmp1          = _mm_add_ps(_tmp1,_tmp10);                                                \
1261      _tmp2          = _mm_add_ps(_tmp2,_tmp11);                                                \
1262      _tmp3          = _mm_add_ps(_tmp3,_tmp12);                                                \
1263      _tmp4          = _mm_add_ps(_tmp4,_tmp13);                                                \
1264      _tmp5          = _mm_add_ps(_tmp5,jy2);                                                   \
1265      _tmp6          = _mm_add_ps(_tmp6,_tmp9);                                                 \
1266      _mm_storeu_ps(ptr1,_tmp1);                                                                \
1267      _mm_storeu_ps(ptr2,_tmp2);                                                                \
1268      _mm_storeu_ps(ptr3,_tmp3);                                                                \
1269      _mm_storeu_ps(ptr4,_tmp4);                                                                \
1270      _mm_storel_pi((__m64 *)(ptr1+4),_tmp5);                                                   \
1271      _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5);                                                   \
1272          _mm_storel_pi((__m64 *)(ptr3+4),_tmp6);                                                   \
1273          _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6);                                                   \
1274 }
1275
1276
1277 #define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1278      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;                                 \
1279      __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;                               \
1280      __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25;                                                    \
1281      _tmp1          = _mm_loadu_ps(ptr1);                                                                 \
1282      _tmp2          = _mm_loadu_ps(ptr1+4);                                                               \
1283      _tmp3          = _mm_load_ss(ptr1+8);                                                                \
1284      _tmp4          = _mm_loadu_ps(ptr2);                                                                 \
1285      _tmp5          = _mm_loadu_ps(ptr2+4);                                                               \
1286      _tmp6          = _mm_load_ss(ptr2+8);                                                                \
1287      _tmp7          = _mm_loadu_ps(ptr3);                                                                 \
1288      _tmp8          = _mm_loadu_ps(ptr3+4);                                                               \
1289      _tmp9          = _mm_load_ss(ptr3+8);                                                                \
1290      _tmp10         = _mm_loadu_ps(ptr4);                                                                 \
1291      _tmp11         = _mm_loadu_ps(ptr4+4);                                                               \
1292      _tmp12         = _mm_load_ss(ptr4+8);                                                                \
1293      _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                                           \
1294      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                           \
1295      _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                                           \
1296      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                           \
1297      _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                                           \
1298      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                           \
1299      _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                                           \
1300      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                           \
1301      _tmp17         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                                       \
1302      _tmp18         = _mm_movehl_ps(jz3,jz3);                                                             \
1303      _tmp19         = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1));                                 \
1304      _tmp20         = _mm_movelh_ps(jx1,jz1);                                                             \
1305      _tmp21         = _mm_movehl_ps(jz1,jx1);                                                             \
1306      _tmp22         = _mm_movelh_ps(_tmp13,_tmp14);                                                       \
1307      _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                                                       \
1308      _tmp23         = _mm_movelh_ps(jy2,jx3);                                                             \
1309      _tmp24         = _mm_movehl_ps(jx3,jy2);                                                             \
1310      _tmp25         = _mm_movelh_ps(_tmp15,_tmp16);                                                       \
1311      _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                                                       \
1312      _tmp1          = _mm_add_ps(_tmp1,_tmp20);                                                           \
1313      _tmp2          = _mm_add_ps(_tmp2,_tmp23);                                                           \
1314      _tmp3          = _mm_add_ss(_tmp3,jz3);                                                              \
1315      _tmp4          = _mm_add_ps(_tmp4,_tmp21);                                                           \
1316      _tmp5          = _mm_add_ps(_tmp5,_tmp24);                                                           \
1317      _tmp6          = _mm_add_ss(_tmp6,_tmp17);                                                           \
1318      _tmp7          = _mm_add_ps(_tmp7,_tmp22);                                                           \
1319      _tmp8          = _mm_add_ps(_tmp8,_tmp25);                                                           \
1320      _tmp9          = _mm_add_ss(_tmp9,_tmp18);                                                           \
1321      _tmp10         = _mm_add_ps(_tmp10,_tmp14);                                                          \
1322      _tmp11         = _mm_add_ps(_tmp11,_tmp16);                                                          \
1323      _tmp12         = _mm_add_ss(_tmp12,_tmp19);                                                          \
1324      _mm_storeu_ps(ptr1,_tmp1);                                                                           \
1325      _mm_storeu_ps(ptr1+4,_tmp2);                                                                         \
1326      _mm_store_ss(ptr1+8,_tmp3);                                                                          \
1327      _mm_storeu_ps(ptr2,_tmp4);                                                                           \
1328      _mm_storeu_ps(ptr2+4,_tmp5);                                                                         \
1329      _mm_store_ss(ptr2+8,_tmp6);                                                                          \
1330      _mm_storeu_ps(ptr3,_tmp7);                                                                           \
1331      _mm_storeu_ps(ptr3+4,_tmp8);                                                                         \
1332      _mm_store_ss(ptr3+8,_tmp9);                                                                          \
1333      _mm_storeu_ps(ptr4,_tmp10);                                                                          \
1334      _mm_storeu_ps(ptr4+4,_tmp11);                                                                        \
1335      _mm_store_ss(ptr4+8,_tmp12);                                                                         \
1336 }
1337
1338
1339 #define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1340      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                                      \
1341      __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;                             \
1342      __m128 _tmp23,_tmp24;                                                                                            \
1343      _tmp1          = _mm_loadu_ps(ptr1);                                                                             \
1344      _tmp2          = _mm_loadu_ps(ptr1+4);                                                                           \
1345      _tmp3          = _mm_loadu_ps(ptr1+8);                                                                           \
1346      _tmp4          = _mm_loadu_ps(ptr2);                                                                             \
1347      _tmp5          = _mm_loadu_ps(ptr2+4);                                                                           \
1348      _tmp6          = _mm_loadu_ps(ptr2+8);                                                                           \
1349      _tmp7          = _mm_loadu_ps(ptr3);                                                                             \
1350      _tmp8          = _mm_loadu_ps(ptr3+4);                                                                           \
1351      _tmp9          = _mm_loadu_ps(ptr3+8);                                                                           \
1352      _tmp10         = _mm_loadu_ps(ptr4);                                                                             \
1353      _tmp11         = _mm_loadu_ps(ptr4+4);                                                                           \
1354      _tmp12         = _mm_loadu_ps(ptr4+8);                                                                           \
1355      _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                                                       \
1356      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                                       \
1357      _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                                                       \
1358      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                                       \
1359      _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                                                       \
1360      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                                       \
1361      _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                                                       \
1362      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                                       \
1363      _tmp17         = _mm_unpackhi_ps(jz3,jx4);                                                                       \
1364      jz3            = _mm_unpacklo_ps(jz3,jx4);                                                                       \
1365      _tmp18         = _mm_unpackhi_ps(jy4,jz4);                                                                       \
1366      jy4            = _mm_unpacklo_ps(jy4,jz4);                                                                       \
1367      _tmp19         = _mm_movelh_ps(jx1,jz1);                                                                         \
1368      jz1            = _mm_movehl_ps(jz1,jx1);                                                                         \
1369      _tmp20         = _mm_movelh_ps(_tmp13,_tmp14);                                                                   \
1370      _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                                                                   \
1371      _tmp21         = _mm_movelh_ps(jy2,jx3);                                                                         \
1372      jx3            = _mm_movehl_ps(jx3,jy2);                                                                         \
1373      _tmp22         = _mm_movelh_ps(_tmp15,_tmp16);                                                                   \
1374      _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                                                                   \
1375      _tmp23         = _mm_movelh_ps(jz3,jy4);                                                                         \
1376      jy4            = _mm_movehl_ps(jy4,jz3);                                                                         \
1377      _tmp24         = _mm_movelh_ps(_tmp17,_tmp18);                                                                   \
1378      _tmp18         = _mm_movehl_ps(_tmp18,_tmp17);                                                                   \
1379      _tmp1          = _mm_add_ps(_tmp1,_tmp19);                                                                       \
1380      _tmp2          = _mm_add_ps(_tmp2,_tmp21);                                                                       \
1381      _tmp3          = _mm_add_ps(_tmp3,_tmp23);                                                                       \
1382      _tmp4          = _mm_add_ps(_tmp4,jz1);                                                                          \
1383      _tmp5          = _mm_add_ps(_tmp5,jx3);                                                                          \
1384      _tmp6          = _mm_add_ps(_tmp6,jy4);                                                                          \
1385      _tmp7          = _mm_add_ps(_tmp7,_tmp20);                                                                       \
1386      _tmp8          = _mm_add_ps(_tmp8,_tmp22);                                                                       \
1387      _tmp9          = _mm_add_ps(_tmp9,_tmp24);                                                                       \
1388      _tmp10         = _mm_add_ps(_tmp10,_tmp14);                                                                      \
1389      _tmp11         = _mm_add_ps(_tmp11,_tmp16);                                                                      \
1390      _tmp12         = _mm_add_ps(_tmp12,_tmp18);                                                                      \
1391      _mm_storeu_ps(ptr1,_tmp1);                                                                                       \
1392      _mm_storeu_ps(ptr1+4,_tmp2);                                                                                     \
1393      _mm_storeu_ps(ptr1+8,_tmp3);                                                                                     \
1394      _mm_storeu_ps(ptr2,_tmp4);                                                                                       \
1395      _mm_storeu_ps(ptr2+4,_tmp5);                                                                                     \
1396      _mm_storeu_ps(ptr2+8,_tmp6);                                                                                     \
1397      _mm_storeu_ps(ptr3,_tmp7);                                                                                       \
1398      _mm_storeu_ps(ptr3+4,_tmp8);                                                                                     \
1399      _mm_storeu_ps(ptr3+8,_tmp9);                                                                                     \
1400      _mm_storeu_ps(ptr4,_tmp10);                                                                                      \
1401      _mm_storeu_ps(ptr4+4,_tmp11);                                                                                    \
1402      _mm_storeu_ps(ptr4+8,_tmp12);                                                                                    \
1403 }
1404
1405
1406
1407 #define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) {     \
1408     __m128 _tmp1;                                                    \
1409     jy1            = _mm_unpacklo_ps(jy1,jz1);                       \
1410     jx1            = _mm_movelh_ps(jx1,jy1);                         \
1411     _tmp1          = _mm_load_ss(ptr1);                              \
1412     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
1413     _tmp1          = _mm_sub_ps(_tmp1,jx1);                          \
1414     _mm_store_ss(ptr1,_tmp1);                                        \
1415     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
1416 }
1417
1418
1419 #define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
1420     __m128 _tmp1, _tmp2;                                                      \
1421     _tmp1          = _mm_loadu_ps(ptr1);                                      \
1422     _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));        \
1423     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1424     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
1425     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
1426     jx1            = _mm_movelh_ps(jx1,jz1);                                  \
1427     _tmp1          = _mm_sub_ps(_tmp1,jx1);                                   \
1428     _tmp2          = _mm_sub_ps(_tmp2,jy2);                                   \
1429     _mm_storeu_ps(ptr1,_tmp1);                                                \
1430     _mm_storel_pi((__m64 *)(ptr1+4),_tmp2);                                   \
1431 }
1432
1433
1434 #define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1435     __m128 _tmp1, _tmp2, _tmp3;                                                           \
1436     _tmp1          = _mm_loadu_ps(ptr1);                                                  \
1437     _tmp2          = _mm_loadu_ps(ptr1+4);                                                \
1438     _tmp3          = _mm_load_ss(ptr1+8);                                                 \
1439     jx1            = _mm_unpacklo_ps(jx1,jy1);                                            \
1440     jz1            = _mm_unpacklo_ps(jz1,jx2);                                            \
1441     jy2            = _mm_unpacklo_ps(jy2,jz2);                                            \
1442     jx3            = _mm_unpacklo_ps(jx3,jy3);                                            \
1443     jx1            = _mm_movelh_ps(jx1,jz1);                                              \
1444     jy2            = _mm_movelh_ps(jy2,jx3);                                              \
1445     _tmp1          = _mm_sub_ps(_tmp1,jx1);                                               \
1446     _tmp2          = _mm_sub_ps(_tmp2,jy2);                                               \
1447     _tmp3          = _mm_sub_ss(_tmp3,jz3);                                               \
1448     _mm_storeu_ps(ptr1,_tmp1);                                                            \
1449     _mm_storeu_ps(ptr1+4,_tmp2);                                                          \
1450     _mm_store_ss(ptr1+8,_tmp3);                                                           \
1451 }
1452
1453
1454 #define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1455     __m128 _tmp1, _tmp2, _tmp3;                                                                       \
1456     _tmp1          = _mm_loadu_ps(ptr1);                                                              \
1457     _tmp2          = _mm_loadu_ps(ptr1+4);                                                            \
1458     _tmp3          = _mm_loadu_ps(ptr1+8);                                                            \
1459     jx1            = _mm_unpacklo_ps(jx1,jy1);                                                        \
1460     jz1            = _mm_unpacklo_ps(jz1,jx2);                                                        \
1461     jy2            = _mm_unpacklo_ps(jy2,jz2);                                                        \
1462     jx3            = _mm_unpacklo_ps(jx3,jy3);                                                        \
1463     jz3            = _mm_unpacklo_ps(jz3,jx4);                                                        \
1464     jy4            = _mm_unpacklo_ps(jy4,jz4);                                                        \
1465     jx1            = _mm_movelh_ps(jx1,jz1);                                                          \
1466     jy2            = _mm_movelh_ps(jy2,jx3);                                                          \
1467     jz3            = _mm_movelh_ps(jz3,jy4);                                                          \
1468     _tmp1          = _mm_sub_ps(_tmp1,jx1);                                                           \
1469     _tmp2          = _mm_sub_ps(_tmp2,jy2);                                                           \
1470     _tmp3          = _mm_sub_ps(_tmp3,jz3);                                                           \
1471     _mm_storeu_ps(ptr1,_tmp1);                                                                        \
1472     _mm_storeu_ps(ptr1+4,_tmp2);                                                                      \
1473     _mm_storeu_ps(ptr1+8,_tmp3);                                                                      \
1474 }
1475
1476
1477 #define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {        \
1478     __m128 _tmp1,_tmp2,_tmp3,_tmp4;                                           \
1479     _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));          \
1480     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2));                     \
1481     _tmp2          = _mm_load_ss(ptr1+2);                                     \
1482     _tmp3          = _mm_load_ss(ptr2+2);                                     \
1483     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1484     _tmp4          = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1));            \
1485     _tmp1          = _mm_sub_ps(_tmp1,jx1);                                   \
1486     _mm_storel_pi((__m64 *)(ptr1),_tmp1);                                     \
1487     _mm_storeh_pi((__m64 *)(ptr2),_tmp1);                                     \
1488     _mm_store_ss(ptr1+2,_mm_sub_ss(_tmp2,jz1));                               \
1489     _mm_store_ss(ptr2+2,_mm_sub_ss(_tmp3,_tmp4));                             \
1490 }
1491
1492
1493 #define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
1494     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                           \
1495     _tmp1          = _mm_loadu_ps(ptr1);                                            \
1496     _tmp2          = _mm_loadu_ps(ptr2);                                            \
1497     _tmp3          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));              \
1498     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4));                         \
1499     jx1            = _mm_unpacklo_ps(jx1,jy1);                                      \
1500     jz1            = _mm_unpacklo_ps(jz1,jx2);                                      \
1501     jy2            = _mm_unpacklo_ps(jy2,jz2);                                      \
1502     _tmp4          = _mm_movelh_ps(jx1,jz1);                                        \
1503     _tmp5          = _mm_movehl_ps(jz1,jx1);                                        \
1504     _tmp1          = _mm_sub_ps(_tmp1,_tmp4);                                       \
1505     _tmp2          = _mm_sub_ps(_tmp2,_tmp5);                                       \
1506     _tmp3          = _mm_sub_ps(_tmp3,jy2);                                         \
1507     _mm_storeu_ps(ptr1,_tmp1);                                                      \
1508     _mm_storeu_ps(ptr2,_tmp2);                                                      \
1509     _mm_storel_pi((__m64 *)(ptr1+4),_tmp3);                                         \
1510     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3);                                         \
1511 }
1512
1513
1514 #define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) {\
1515     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                \
1516     _tmp1          = _mm_loadu_ps(ptr1);                                                       \
1517     _tmp2          = _mm_loadu_ps(ptr1+4);                                                     \
1518     _tmp3          = _mm_load_ss(ptr1+8);                                                      \
1519     _tmp4          = _mm_loadu_ps(ptr2);                                                       \
1520     _tmp5          = _mm_loadu_ps(ptr2+4);                                                     \
1521     _tmp6          = _mm_load_ss(ptr2+8);                                                      \
1522     jx1            = _mm_unpacklo_ps(jx1,jy1);                                                 \
1523     jz1            = _mm_unpacklo_ps(jz1,jx2);                                                 \
1524     jy2            = _mm_unpacklo_ps(jy2,jz2);                                                 \
1525     jx3            = _mm_unpacklo_ps(jx3,jy3);                                                 \
1526     _tmp7          = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                             \
1527     _tmp8          = _mm_movelh_ps(jx1,jz1);                                                   \
1528     _tmp9          = _mm_movehl_ps(jz1,jx1);                                                   \
1529     _tmp10         = _mm_movelh_ps(jy2,jx3);                                                   \
1530     _tmp11         = _mm_movehl_ps(jx3,jy2);                                                   \
1531     _tmp1          = _mm_sub_ps(_tmp1,_tmp8);                                                  \
1532     _tmp2          = _mm_sub_ps(_tmp2,_tmp10);                                                 \
1533     _tmp3          = _mm_sub_ss(_tmp3,jz3);                                                    \
1534     _tmp4          = _mm_sub_ps(_tmp4,_tmp9);                                                  \
1535     _tmp5          = _mm_sub_ps(_tmp5,_tmp11);                                                 \
1536     _tmp6          = _mm_sub_ss(_tmp6,_tmp7);                                                  \
1537     _mm_storeu_ps(ptr1,_tmp1);                                                                 \
1538     _mm_storeu_ps(ptr1+4,_tmp2);                                                               \
1539     _mm_store_ss(ptr1+8,_tmp3);                                                                \
1540     _mm_storeu_ps(ptr2,_tmp4);                                                                 \
1541     _mm_storeu_ps(ptr2+4,_tmp5);                                                               \
1542     _mm_store_ss(ptr2+8,_tmp6);                                                                \
1543 }
1544
1545
1546 #define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) {\
1547     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13;              \
1548     _tmp1          = _mm_loadu_ps(ptr1);                                                                   \
1549     _tmp2          = _mm_loadu_ps(ptr1+4);                                                                 \
1550     _tmp3          = _mm_loadu_ps(ptr1+8);                                                                 \
1551     _tmp4          = _mm_loadu_ps(ptr2);                                                                   \
1552     _tmp5          = _mm_loadu_ps(ptr2+4);                                                                 \
1553     _tmp6          = _mm_loadu_ps(ptr2+8);                                                                 \
1554     jx1            = _mm_unpacklo_ps(jx1,jy1);                                                             \
1555     jz1            = _mm_unpacklo_ps(jz1,jx2);                                                             \
1556     jy2            = _mm_unpacklo_ps(jy2,jz2);                                                             \
1557     jx3            = _mm_unpacklo_ps(jx3,jy3);                                                             \
1558     jz3            = _mm_unpacklo_ps(jz3,jx4);                                                             \
1559     jy4            = _mm_unpacklo_ps(jy4,jz4);                                                             \
1560     _tmp8          = _mm_movelh_ps(jx1,jz1);                                                               \
1561     _tmp9          = _mm_movehl_ps(jz1,jx1);                                                               \
1562     _tmp10         = _mm_movelh_ps(jy2,jx3);                                                               \
1563     _tmp11         = _mm_movehl_ps(jx3,jy2);                                                               \
1564     _tmp12         = _mm_movelh_ps(jz3,jy4);                                                               \
1565     _tmp13         = _mm_movehl_ps(jy4,jz3);                                                               \
1566     _tmp1          = _mm_sub_ps(_tmp1,_tmp8);                                                              \
1567     _tmp2          = _mm_sub_ps(_tmp2,_tmp10);                                                             \
1568     _tmp3          = _mm_sub_ps(_tmp3,_tmp12);                                                             \
1569     _tmp4          = _mm_sub_ps(_tmp4,_tmp9);                                                              \
1570     _tmp5          = _mm_sub_ps(_tmp5,_tmp11);                                                             \
1571     _tmp6          = _mm_sub_ps(_tmp6,_tmp13);                                                             \
1572     _mm_storeu_ps(ptr1,_tmp1);                                                                             \
1573     _mm_storeu_ps(ptr1+4,_tmp2);                                                                           \
1574     _mm_storeu_ps(ptr1+8,_tmp3);                                                                           \
1575     _mm_storeu_ps(ptr2,_tmp4);                                                                             \
1576     _mm_storeu_ps(ptr2+4,_tmp5);                                                                           \
1577     _mm_storeu_ps(ptr2+8,_tmp6);                                                                           \
1578 }
1579
1580
1581 #define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
1582     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7;                       \
1583     _tmp1          = _mm_load_ss(ptr1);                                     \
1584     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));                 \
1585     _tmp2          = _mm_load_ss(ptr2);                                     \
1586     _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));                 \
1587     _tmp3          = _mm_load_ss(ptr3);                                     \
1588     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));                 \
1589     _tmp4          = _mm_unpacklo_ps(jy1,jz1);                              \
1590     _tmp5          = _mm_unpackhi_ps(jy1,jz1);                              \
1591     _tmp6          = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1));        \
1592     _tmp7          = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));          \
1593     jx1            = _mm_movelh_ps(jx1,_tmp4);                              \
1594     _tmp7          = _mm_movelh_ps(_tmp7,_tmp5);                            \
1595     _tmp1          = _mm_sub_ps(_tmp1,jx1);                                 \
1596     _tmp2          = _mm_sub_ps(_tmp2,_tmp6);                               \
1597     _tmp3          = _mm_sub_ps(_tmp3,_tmp7);                               \
1598     _mm_store_ss(ptr1,_tmp1);                                               \
1599     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                                 \
1600     _mm_store_ss(ptr2,_tmp2);                                               \
1601     _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                                 \
1602     _mm_store_ss(ptr3,_tmp3);                                               \
1603     _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                                 \
1604 }
1605
1606
1607 #define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
1608     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;                 \
1609     _tmp1          = _mm_loadu_ps(ptr1);                                                 \
1610     _tmp2          = _mm_loadu_ps(ptr2);                                                 \
1611     _tmp3          = _mm_loadu_ps(ptr3);                                                 \
1612     _tmp4          = _mm_loadl_pi(_tmp4,(__m64 *)(ptr1+4));                              \
1613     _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr2+4));                              \
1614     _tmp5          = _mm_loadl_pi(_tmp5,(__m64 *)(ptr3+4));                              \
1615     _tmp6          = _mm_unpackhi_ps(jx1,jy1);                                           \
1616     jx1            = _mm_unpacklo_ps(jx1,jy1);                                           \
1617     _tmp7          = _mm_unpackhi_ps(jz1,jx2);                                           \
1618     jz1            = _mm_unpacklo_ps(jz1,jx2);                                           \
1619     _tmp8          = _mm_unpackhi_ps(jy2,jz2);                                           \
1620     jy2            = _mm_unpacklo_ps(jy2,jz2);                                           \
1621     _tmp9          = _mm_movelh_ps(jx1,jz1);                                             \
1622     _tmp10         = _mm_movehl_ps(jz1,jx1);                                             \
1623     _tmp6          = _mm_movelh_ps(_tmp6,_tmp7);                                         \
1624     _tmp1          = _mm_sub_ps(_tmp1,_tmp9);                                            \
1625     _tmp2          = _mm_sub_ps(_tmp2,_tmp10);                                           \
1626     _tmp3          = _mm_sub_ps(_tmp3,_tmp6);                                            \
1627     _tmp4          = _mm_sub_ps(_tmp4,jy2);                                              \
1628     _tmp5          = _mm_sub_ps(_tmp5,_tmp8);                                            \
1629     _mm_storeu_ps(ptr1,_tmp1);                                                           \
1630     _mm_storeu_ps(ptr2,_tmp2);                                                           \
1631     _mm_storeu_ps(ptr3,_tmp3);                                                           \
1632     _mm_storel_pi((__m64 *)(ptr1+4),_tmp4);                                              \
1633     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp4);                                              \
1634     _mm_storel_pi((__m64 *)(ptr3+4),_tmp5);                                              \
1635 }
1636
1637
1638 #define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1639     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;      \
1640     __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;    \
1641     _tmp1          = _mm_loadu_ps(ptr1);                                      \
1642     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
1643     _tmp3          = _mm_load_ss(ptr1+8);                                     \
1644     _tmp4          = _mm_loadu_ps(ptr2);                                      \
1645     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
1646     _tmp6          = _mm_load_ss(ptr2+8);                                     \
1647     _tmp7          = _mm_loadu_ps(ptr3);                                      \
1648     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
1649     _tmp9          = _mm_load_ss(ptr3+8);                                     \
1650     _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                \
1651     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1652     _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                \
1653     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
1654     _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                \
1655     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
1656     _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                \
1657     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
1658     _tmp14         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));            \
1659     _tmp15         = _mm_movehl_ps(jz3,jz3);                                  \
1660     _tmp16         = _mm_movelh_ps(jx1,jz1);                                  \
1661     _tmp17         = _mm_movehl_ps(jz1,jx1);                                  \
1662     _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                            \
1663     _tmp18         = _mm_movelh_ps(jy2,jx3);                                  \
1664     _tmp19         = _mm_movehl_ps(jx3,jy2);                                  \
1665     _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                            \
1666     _tmp1          = _mm_sub_ps(_tmp1,_tmp16);                                \
1667     _tmp2          = _mm_sub_ps(_tmp2,_tmp18);                                \
1668     _tmp3          = _mm_sub_ss(_tmp3,jz3);                                   \
1669     _tmp4          = _mm_sub_ps(_tmp4,_tmp17);                                \
1670     _tmp5          = _mm_sub_ps(_tmp5,_tmp19);                                \
1671     _tmp6          = _mm_sub_ss(_tmp6,_tmp14);                                \
1672     _tmp7          = _mm_sub_ps(_tmp7,_tmp10);                                \
1673     _tmp8          = _mm_sub_ps(_tmp8,_tmp12);                                \
1674     _tmp9          = _mm_sub_ss(_tmp9,_tmp15);                                \
1675     _mm_storeu_ps(ptr1,_tmp1);                                                \
1676     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
1677     _mm_store_ss(ptr1+8,_tmp3);                                               \
1678     _mm_storeu_ps(ptr2,_tmp4);                                                \
1679     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
1680     _mm_store_ss(ptr2+8,_tmp6);                                               \
1681     _mm_storeu_ps(ptr3,_tmp7);                                                \
1682     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
1683     _mm_store_ss(ptr3+8,_tmp9);                                               \
1684 }
1685
1686
1687 #define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1688     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                                  \
1689     __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21;                                \
1690     _tmp1          = _mm_loadu_ps(ptr1);                                      \
1691     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
1692     _tmp3          = _mm_loadu_ps(ptr1+8);                                    \
1693     _tmp4          = _mm_loadu_ps(ptr2);                                      \
1694     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
1695     _tmp6          = _mm_loadu_ps(ptr2+8);                                    \
1696     _tmp7          = _mm_loadu_ps(ptr3);                                      \
1697     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
1698     _tmp9          = _mm_loadu_ps(ptr3+8);                                    \
1699     _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                \
1700     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1701     _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                \
1702     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
1703     _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                \
1704     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
1705     _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                \
1706     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
1707     _tmp14         = _mm_unpackhi_ps(jz3,jx4);                                \
1708     jz3            = _mm_unpacklo_ps(jz3,jx4);                                \
1709     _tmp15         = _mm_unpackhi_ps(jy4,jz4);                                \
1710     jy4            = _mm_unpacklo_ps(jy4,jz4);                                \
1711     _tmp16         = _mm_movelh_ps(jx1,jz1);                                  \
1712     _tmp17         = _mm_movehl_ps(jz1,jx1);                                  \
1713     _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                            \
1714     _tmp18         = _mm_movelh_ps(jy2,jx3);                                  \
1715     _tmp19         = _mm_movehl_ps(jx3,jy2);                                  \
1716     _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                            \
1717     _tmp20         = _mm_movelh_ps(jz3,jy4);                                  \
1718     _tmp21         = _mm_movehl_ps(jy4,jz3);                                  \
1719     _tmp14         = _mm_movelh_ps(_tmp14,_tmp15);                            \
1720     _tmp1          = _mm_sub_ps(_tmp1,_tmp16);                                \
1721     _tmp2          = _mm_sub_ps(_tmp2,_tmp18);                                \
1722     _tmp3          = _mm_sub_ps(_tmp3,_tmp20);                                \
1723     _tmp4          = _mm_sub_ps(_tmp4,_tmp17);                                \
1724     _tmp5          = _mm_sub_ps(_tmp5,_tmp19);                                \
1725     _tmp6          = _mm_sub_ps(_tmp6,_tmp21);                                \
1726     _tmp7          = _mm_sub_ps(_tmp7,_tmp10);                                \
1727     _tmp8          = _mm_sub_ps(_tmp8,_tmp12);                                \
1728     _tmp9          = _mm_sub_ps(_tmp9,_tmp14);                                \
1729     _mm_storeu_ps(ptr1,_tmp1);                                                \
1730     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
1731     _mm_storeu_ps(ptr1+8,_tmp3);                                              \
1732     _mm_storeu_ps(ptr2,_tmp4);                                                \
1733     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
1734     _mm_storeu_ps(ptr2+8,_tmp6);                                              \
1735     _mm_storeu_ps(ptr3,_tmp7);                                                \
1736     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
1737     _mm_storeu_ps(ptr3+8,_tmp9);                                              \
1738 }
1739
1740
1741
1742
1743 #define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1744     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;         \
1745     _tmp1          = _mm_load_ss(ptr1);                              \
1746     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
1747     _tmp2          = _mm_load_ss(ptr2);                              \
1748     _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));          \
1749     _tmp3          = _mm_load_ss(ptr3);                              \
1750     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));          \
1751     _tmp4          = _mm_load_ss(ptr4);                              \
1752     _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1));          \
1753     _tmp5          = _mm_unpacklo_ps(jy1,jz1);                       \
1754     _tmp6          = _mm_unpackhi_ps(jy1,jz1);                       \
1755     _tmp7          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1756     _tmp8          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1757     _tmp9          = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1758     _tmp10         = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1759     _tmp1          = _mm_sub_ps(_tmp1,_tmp7);                        \
1760     _tmp2          = _mm_sub_ps(_tmp2,_tmp8);                        \
1761     _tmp3          = _mm_sub_ps(_tmp3,_tmp9);                        \
1762     _tmp4          = _mm_sub_ps(_tmp4,_tmp10);                       \
1763     _mm_store_ss(ptr1,_tmp1);                                        \
1764     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
1765     _mm_store_ss(ptr2,_tmp2);                                        \
1766     _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                          \
1767     _mm_store_ss(ptr3,_tmp3);                                        \
1768     _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                          \
1769     _mm_store_ss(ptr4,_tmp4);                                        \
1770     _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4);                          \
1771 }
1772
1773
1774
1775 #define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1776     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1777     _tmp1          = _mm_loadu_ps(ptr1);                                       \
1778     _tmp2          = _mm_loadu_ps(ptr2);                                       \
1779     _tmp3          = _mm_loadu_ps(ptr3);                                       \
1780     _tmp4          = _mm_loadu_ps(ptr4);                                       \
1781     _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));         \
1782     _tmp5          = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4));                    \
1783     _tmp6          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));         \
1784     _tmp6          = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4));                    \
1785     _tmp7          = _mm_unpackhi_ps(jx1,jy1);                                 \
1786     jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
1787     _tmp8          = _mm_unpackhi_ps(jz1,jx2);                                 \
1788     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
1789     _tmp9          = _mm_unpackhi_ps(jy2,jz2);                                 \
1790     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
1791     _tmp10         = _mm_movelh_ps(jx1,jz1);                                   \
1792     _tmp11         = _mm_movehl_ps(jz1,jx1);                                   \
1793     _tmp12         = _mm_movelh_ps(_tmp7,_tmp8);                               \
1794     _tmp13         = _mm_movehl_ps(_tmp8,_tmp7);                               \
1795     _tmp1          = _mm_sub_ps(_tmp1,_tmp10);                                 \
1796     _tmp2          = _mm_sub_ps(_tmp2,_tmp11);                                 \
1797     _tmp3          = _mm_sub_ps(_tmp3,_tmp12);                                 \
1798     _tmp4          = _mm_sub_ps(_tmp4,_tmp13);                                 \
1799     _tmp5          = _mm_sub_ps(_tmp5,jy2);                                    \
1800     _tmp6          = _mm_sub_ps(_tmp6,_tmp9);                                  \
1801     _mm_storeu_ps(ptr1,_tmp1);                                                 \
1802     _mm_storeu_ps(ptr2,_tmp2);                                                 \
1803     _mm_storeu_ps(ptr3,_tmp3);                                                 \
1804     _mm_storeu_ps(ptr4,_tmp4);                                                 \
1805     _mm_storel_pi((__m64 *)(ptr1+4),_tmp5);                                    \
1806     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5);                                    \
1807     _mm_storel_pi((__m64 *)(ptr3+4),_tmp6);                                    \
1808     _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6);                                    \
1809 }
1810
1811
1812 #define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1813     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;       \
1814     __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;     \
1815     __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25;                          \
1816     _tmp1          = _mm_loadu_ps(ptr1);                                       \
1817     _tmp2          = _mm_loadu_ps(ptr1+4);                                     \
1818     _tmp3          = _mm_load_ss(ptr1+8);                                      \
1819     _tmp4          = _mm_loadu_ps(ptr2);                                       \
1820     _tmp5          = _mm_loadu_ps(ptr2+4);                                     \
1821     _tmp6          = _mm_load_ss(ptr2+8);                                      \
1822     _tmp7          = _mm_loadu_ps(ptr3);                                       \
1823     _tmp8          = _mm_loadu_ps(ptr3+4);                                     \
1824     _tmp9          = _mm_load_ss(ptr3+8);                                      \
1825     _tmp10         = _mm_loadu_ps(ptr4);                                       \
1826     _tmp11         = _mm_loadu_ps(ptr4+4);                                     \
1827     _tmp12         = _mm_load_ss(ptr4+8);                                      \
1828     _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                 \
1829     jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
1830     _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                 \
1831     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
1832     _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                 \
1833     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
1834     _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                 \
1835     jx3            = _mm_unpacklo_ps(jx3,jy3);                                 \
1836     _tmp17         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));             \
1837     _tmp18         = _mm_movehl_ps(jz3,jz3);                                   \
1838     _tmp19         = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1));       \
1839     _tmp20         = _mm_movelh_ps(jx1,jz1);                                   \
1840     _tmp21         = _mm_movehl_ps(jz1,jx1);                                   \
1841     _tmp22         = _mm_movelh_ps(_tmp13,_tmp14);                             \
1842     _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                             \
1843     _tmp23         = _mm_movelh_ps(jy2,jx3);                                   \
1844     _tmp24         = _mm_movehl_ps(jx3,jy2);                                   \
1845     _tmp25         = _mm_movelh_ps(_tmp15,_tmp16);                             \
1846     _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                             \
1847     _tmp1          = _mm_sub_ps(_tmp1,_tmp20);                                 \
1848     _tmp2          = _mm_sub_ps(_tmp2,_tmp23);                                 \
1849     _tmp3          = _mm_sub_ss(_tmp3,jz3);                                    \
1850     _tmp4          = _mm_sub_ps(_tmp4,_tmp21);                                 \
1851     _tmp5          = _mm_sub_ps(_tmp5,_tmp24);                                 \
1852     _tmp6          = _mm_sub_ss(_tmp6,_tmp17);                                 \
1853     _tmp7          = _mm_sub_ps(_tmp7,_tmp22);                                 \
1854     _tmp8          = _mm_sub_ps(_tmp8,_tmp25);                                 \
1855     _tmp9          = _mm_sub_ss(_tmp9,_tmp18);                                 \
1856     _tmp10         = _mm_sub_ps(_tmp10,_tmp14);                                \
1857     _tmp11         = _mm_sub_ps(_tmp11,_tmp16);                                \
1858     _tmp12         = _mm_sub_ss(_tmp12,_tmp19);                                \
1859     _mm_storeu_ps(ptr1,_tmp1);                                                 \
1860     _mm_storeu_ps(ptr1+4,_tmp2);                                               \
1861     _mm_store_ss(ptr1+8,_tmp3);                                                \
1862     _mm_storeu_ps(ptr2,_tmp4);                                                 \
1863     _mm_storeu_ps(ptr2+4,_tmp5);                                               \
1864     _mm_store_ss(ptr2+8,_tmp6);                                                \
1865     _mm_storeu_ps(ptr3,_tmp7);                                                 \
1866     _mm_storeu_ps(ptr3+4,_tmp8);                                               \
1867     _mm_store_ss(ptr3+8,_tmp9);                                                \
1868     _mm_storeu_ps(ptr4,_tmp10);                                                \
1869     _mm_storeu_ps(ptr4+4,_tmp11);                                              \
1870     _mm_store_ss(ptr4+8,_tmp12);                                               \
1871 }
1872
1873
1874 #define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1875     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;         \
1876     __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
1877     __m128 _tmp23,_tmp24;                                                     \
1878     _tmp1          = _mm_loadu_ps(ptr1);                                      \
1879     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
1880     _tmp3          = _mm_loadu_ps(ptr1+8);                                    \
1881     _tmp4          = _mm_loadu_ps(ptr2);                                      \
1882     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
1883     _tmp6          = _mm_loadu_ps(ptr2+8);                                    \
1884     _tmp7          = _mm_loadu_ps(ptr3);                                      \
1885     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
1886     _tmp9          = _mm_loadu_ps(ptr3+8);                                    \
1887     _tmp10         = _mm_loadu_ps(ptr4);                                      \
1888     _tmp11         = _mm_loadu_ps(ptr4+4);                                    \
1889     _tmp12         = _mm_loadu_ps(ptr4+8);                                    \
1890     _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                \
1891     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1892     _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                \
1893     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
1894     _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                \
1895     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
1896     _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                \
1897     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
1898     _tmp17         = _mm_unpackhi_ps(jz3,jx4);                                \
1899     jz3            = _mm_unpacklo_ps(jz3,jx4);                                \
1900     _tmp18         = _mm_unpackhi_ps(jy4,jz4);                                \
1901     jy4            = _mm_unpacklo_ps(jy4,jz4);                                \
1902     _tmp19         = _mm_movelh_ps(jx1,jz1);                                  \
1903     jz1            = _mm_movehl_ps(jz1,jx1);                                  \
1904     _tmp20         = _mm_movelh_ps(_tmp13,_tmp14);                            \
1905     _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                            \
1906     _tmp21         = _mm_movelh_ps(jy2,jx3);                                  \
1907     jx3            = _mm_movehl_ps(jx3,jy2);                                  \
1908     _tmp22         = _mm_movelh_ps(_tmp15,_tmp16);                            \
1909     _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                            \
1910     _tmp23         = _mm_movelh_ps(jz3,jy4);                                  \
1911     jy4            = _mm_movehl_ps(jy4,jz3);                                  \
1912     _tmp24         = _mm_movelh_ps(_tmp17,_tmp18);                            \
1913     _tmp18         = _mm_movehl_ps(_tmp18,_tmp17);                            \
1914     _tmp1          = _mm_sub_ps(_tmp1,_tmp19);                                \
1915     _tmp2          = _mm_sub_ps(_tmp2,_tmp21);                                \
1916     _tmp3          = _mm_sub_ps(_tmp3,_tmp23);                                \
1917     _tmp4          = _mm_sub_ps(_tmp4,jz1);                                   \
1918     _tmp5          = _mm_sub_ps(_tmp5,jx3);                                   \
1919     _tmp6          = _mm_sub_ps(_tmp6,jy4);                                   \
1920     _tmp7          = _mm_sub_ps(_tmp7,_tmp20);                                \
1921     _tmp8          = _mm_sub_ps(_tmp8,_tmp22);                                \
1922     _tmp9          = _mm_sub_ps(_tmp9,_tmp24);                                \
1923     _tmp10         = _mm_sub_ps(_tmp10,_tmp14);                               \
1924     _tmp11         = _mm_sub_ps(_tmp11,_tmp16);                               \
1925     _tmp12         = _mm_sub_ps(_tmp12,_tmp18);                               \
1926     _mm_storeu_ps(ptr1,_tmp1);                                                \
1927     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
1928     _mm_storeu_ps(ptr1+8,_tmp3);                                              \
1929     _mm_storeu_ps(ptr2,_tmp4);                                                \
1930     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
1931     _mm_storeu_ps(ptr2+8,_tmp6);                                              \
1932     _mm_storeu_ps(ptr3,_tmp7);                                                \
1933     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
1934     _mm_storeu_ps(ptr3+8,_tmp9);                                              \
1935     _mm_storeu_ps(ptr4,_tmp10);                                               \
1936     _mm_storeu_ps(ptr4+4,_tmp11);                                             \
1937     _mm_storeu_ps(ptr4+8,_tmp12);                                             \
1938 }
1939
1940
1941
1942
1943
1944
1945 /* Routine to be called with rswitch/rcut at the beginning of a kernel
1946  * to set up the 7 constants used for analytic 5th order switch calculations.
1947  */
1948 #define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) {  \
1949         const __m128  _swsetup_cm6  = { -6.0, -6.0, -6.0, -6.0};                                                 \
1950         const __m128 _swsetup_cm10  = {-10.0,-10.0,-10.0,-10.0};                                                 \
1951         const __m128  _swsetup_c15  = { 15.0, 15.0, 15.0, 15.0};                                                 \
1952         const __m128 _swsetup_cm30  = {-30.0,-30.0,-30.0,-30.0};                                                 \
1953         const __m128  _swsetup_c60  = { 60.0, 60.0, 60.0, 60.0};                                                 \
1954                                                                                                              \
1955         __m128 d,dinv,dinv2,dinv3,dinv4,dinv5;                                                                   \
1956                                                                                                                  \
1957         d       = _mm_sub_ps(rcut,rswitch);                                                                      \
1958         dinv    = gmx_mm_inv_ps(d);                                                                              \
1959         dinv2   = _mm_mul_ps(dinv,dinv);                                                                         \
1960         dinv3   = _mm_mul_ps(dinv2,dinv);                                                                        \
1961         dinv4   = _mm_mul_ps(dinv2,dinv2);                                                                       \
1962         dinv5   = _mm_mul_ps(dinv3,dinv2);                                                                       \
1963                                                                                                                  \
1964         switch_C3 = _mm_mul_ps(_swsetup_cm10,dinv3);                                                             \
1965         switch_C4 = _mm_mul_ps(_swsetup_c15,dinv4);                                                              \
1966         switch_C5 = _mm_mul_ps(_swsetup_cm6,dinv5);                                                              \
1967         switch_D2 = _mm_mul_ps(_swsetup_cm30,dinv3);                                                             \
1968         switch_D3 = _mm_mul_ps(_swsetup_c60,dinv4);                                                              \
1969         switch_D4 = _mm_mul_ps(_swsetup_cm30,dinv5);                                                             \
1970 }
1971
1972
1973 #define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
1974     const __m128  _sw_one  = {  1.0,  1.0,  1.0,  1.0};                                         \
1975     __m128 d,d2;                                                                                \
1976     d     = _mm_max_ps(r,rswitch);                                                              \
1977     d     = _mm_min_ps(d,rcut);                                                                 \
1978     d     = _mm_sub_ps(d,rswitch);                                                              \
1979     d2    = _mm_mul_ps(d,d);                                                                    \
1980     sw    = _mm_mul_ps(d,sw_C5);                                                                \
1981     dsw   = _mm_mul_ps(d,sw_D4);                                                                \
1982     sw    = _mm_add_ps(sw,sw_C4);                                                               \
1983     dsw   = _mm_add_ps(dsw,sw_D3);                                                              \
1984     sw    = _mm_mul_ps(sw,d);                                                                   \
1985     dsw   = _mm_mul_ps(dsw,d);                                                                  \
1986     sw    = _mm_add_ps(sw,sw_C3);                                                               \
1987     dsw   = _mm_add_ps(dsw,sw_D2);                                                              \
1988     sw    = _mm_mul_ps(sw,_mm_mul_ps(d,d2));                                                    \
1989     dsw   = _mm_mul_ps(dsw,d2);                                                                 \
1990     sw    = _mm_add_ps(sw,_sw_one);                                                             \
1991 }
1992
1993
1994 /* Returns fscaltmp, multiply with rinvsq to get fscal! */
1995 static inline __m128
1996 gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq,__m128 *vctot)
1997 {
1998         __m128 vcoul = _mm_mul_ps(qq,rinv);
1999         *vctot   = _mm_add_ps(*vctot,vcoul);
2000         return vcoul;
2001 }
2002
2003
2004 static inline void
2005 gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq,__m128 *vctot)
2006 {
2007         __m128 vcoul = _mm_mul_ps(qq,rinv);
2008         *vctot   = _mm_add_ps(*vctot,vcoul);
2009         return;
2010 }
2011
2012 /* Returns fscaltmp, multiply with rinvsq to get fscal! */
2013 static inline __m128
2014 gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq,__m128 *vctot)
2015 {
2016         const __m128 two  = {2.0,2.0,2.0,2.0};
2017         __m128 vcoul,krsq;
2018
2019         krsq   = _mm_mul_ps(krf,rsq);
2020         vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
2021         *vctot = _mm_add_ps(*vctot,vcoul);
2022
2023         return _mm_mul_ps(qq, _mm_sub_ps(rinv, _mm_mul_ps(two,krsq)));
2024 }
2025
2026
2027 static inline void
2028 gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq,__m128 *vctot)
2029 {
2030         __m128 vcoul,krsq;
2031
2032         krsq   = _mm_mul_ps(krf,rsq);
2033         vcoul  = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,krsq),crf));
2034         *vctot   = _mm_add_ps(*vctot,vcoul);
2035         return;
2036 }
2037
2038
2039 /* GB */
2040
2041
2042
2043
2044 /* GB + RF */
2045
2046
2047 /* Returns fscaltmp, multiply with rinvsq to get fscal! */
2048 static inline __m128
2049 gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
2050 {
2051         const __m128 six    = {6.0,6.0,6.0,6.0};
2052         const __m128 twelve = {12.0,12.0,12.0,12.0};
2053
2054         __m128 rinvsix,vvdw6,vvdw12;
2055
2056         rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
2057         vvdw6    = _mm_mul_ps(c6,rinvsix);
2058         vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
2059         *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vvdw12,vvdw6));
2060
2061         return _mm_sub_ps( _mm_mul_ps(twelve,vvdw12),_mm_mul_ps(six,vvdw6));
2062 }
2063
2064
2065 static inline void
2066 gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
2067 {
2068         __m128 rinvsix,vvdw6,vvdw12;
2069
2070         rinvsix  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
2071         vvdw6    = _mm_mul_ps(c6,rinvsix);
2072         vvdw12   = _mm_mul_ps(c12, _mm_mul_ps(rinvsix,rinvsix));
2073         *vvdwtot = _mm_add_ps(*vvdwtot , _mm_sub_ps(vvdw12,vvdw6));
2074
2075         return;
2076 }
2077
2078
2079
2080 /* Return force should be multiplied by -rinv to get fscal */
2081 static inline __m128
2082 gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2083 {
2084     __m128  rt,eps,eps2,Y,F,G,H,vcoul;
2085         __m128i n0;
2086         int     n_a,n_b,n_c,n_d;
2087
2088     rt       = _mm_mul_ps(r,tabscale);
2089         n0       = _mm_cvttps_epi32(rt);
2090         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2091         eps2     = _mm_mul_ps(eps,eps);
2092
2093         /* Extract indices from n0 */
2094         n_a      = gmx_mm_extract_epi32(n0,0);
2095         n_b      = gmx_mm_extract_epi32(n0,1);
2096         n_c      = gmx_mm_extract_epi32(n0,2);
2097         n_d      = gmx_mm_extract_epi32(n0,3);
2098         Y        = _mm_load_ps(VFtab + 4* n_a);
2099         F        = _mm_load_ps(VFtab + 4* n_b);
2100         G        = _mm_load_ps(VFtab + 4* n_c);
2101         H        = _mm_load_ps(VFtab + 4* n_d);
2102         _MM_TRANSPOSE4_PS(Y,F,G,H);
2103         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2104         G        = _mm_mul_ps(G,eps);               /* Geps  */
2105         F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
2106         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2107         *vctot   = _mm_add_ps(*vctot,vcoul);
2108
2109         F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2110
2111         return _mm_mul_ps(F,tabscale);
2112 }
2113
2114
2115
2116 /* Return force should be multiplied by -rinv to get fscal */
2117 static inline __m128
2118 gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2119 {
2120     __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2121         __m128i n0;
2122         int     n_a,n_b,n_c,n_d;
2123
2124     rt       = _mm_mul_ps(r,tabscale);
2125         n0       = _mm_cvttps_epi32(rt);
2126         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2127         eps2     = _mm_mul_ps(eps,eps);
2128
2129         /* Extract indices from n0 */
2130         n_a      = gmx_mm_extract_epi32(n0,0);
2131         n_b      = gmx_mm_extract_epi32(n0,1);
2132         n_c      = gmx_mm_extract_epi32(n0,2);
2133         n_d      = gmx_mm_extract_epi32(n0,3);
2134
2135         /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2136          * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2137          * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2138          */
2139         Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset);
2140         Fd       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset);
2141         Gd       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset);
2142         Hd       = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset);
2143         Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + 4*offset + 4);
2144         Fr       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + 4*offset + 4);
2145         Gr       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + 4*offset + 4);
2146         Hr       = _mm_load_ps(VFtab + 4*(offset+2)* n_d + 4*offset + 4);
2147         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2148         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2149         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2150         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2151         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2152         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2153         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2154         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2155         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2156         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2157         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2158
2159         Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2160         Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2161
2162         return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2163 }
2164
2165
2166 /* Return force should be multiplied by -rinv to get fscal */
2167 static inline __m128
2168 gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2169                                                                   __m128 *vctot, __m128 *vvdwtot)
2170 {
2171     __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2172         __m128i n0;
2173         int     n_a,n_b,n_c,n_d;
2174
2175     rt       = _mm_mul_ps(r,tabscale);
2176         n0       = _mm_cvttps_epi32(rt);
2177         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2178         eps2     = _mm_mul_ps(eps,eps);
2179
2180         /* Extract indices from n0 */
2181         n_a      = gmx_mm_extract_epi32(n0,0);
2182         n_b      = gmx_mm_extract_epi32(n0,1);
2183         n_c      = gmx_mm_extract_epi32(n0,2);
2184         n_d      = gmx_mm_extract_epi32(n0,3);
2185
2186
2187         Yc       = _mm_load_ps(VFtab + 12* n_a);
2188         Fc       = _mm_load_ps(VFtab + 12* n_b);
2189         Gc       = _mm_load_ps(VFtab + 12* n_c);
2190         Hc       = _mm_load_ps(VFtab + 12* n_d);
2191         Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
2192         Fd       = _mm_load_ps(VFtab + 12* n_b + 4);
2193         Gd       = _mm_load_ps(VFtab + 12* n_c + 4);
2194         Hd       = _mm_load_ps(VFtab + 12* n_d + 4);
2195         Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
2196         Fr       = _mm_load_ps(VFtab + 12* n_b + 8);
2197         Gr       = _mm_load_ps(VFtab + 12* n_c + 8);
2198         Hr       = _mm_load_ps(VFtab + 12* n_d + 8);
2199         _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2200         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2201         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2202         Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
2203         Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
2204         Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
2205         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2206         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2207         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2208         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2209         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2210         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2211
2212         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2213         *vctot   = _mm_add_ps(*vctot,vcoul);
2214
2215         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2216         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2217         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2218
2219         Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2220         Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2221         Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2222
2223         return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2224 }
2225
2226
2227
2228 /* Return force should be multiplied by -rinv to get fscal */
2229 static inline __m128
2230 gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2231 {
2232     __m128  rt,eps,eps2,Y,F,G,H,vcoul;
2233         __m128i n0;
2234         int     n_a,n_b,n_c;
2235
2236     rt       = _mm_mul_ps(r,tabscale);
2237         n0       = _mm_cvttps_epi32(rt);
2238         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2239         eps2     = _mm_mul_ps(eps,eps);
2240
2241         /* Extract indices from n0 */
2242         n_a      = gmx_mm_extract_epi32(n0,0);
2243         n_b      = gmx_mm_extract_epi32(n0,1);
2244         n_c      = gmx_mm_extract_epi32(n0,2);
2245         Y        = _mm_load_ps(VFtab + 4* n_a);
2246         F        = _mm_load_ps(VFtab + 4* n_b);
2247         G        = _mm_load_ps(VFtab + 4* n_c);
2248         H        = _mm_setzero_ps();
2249         _MM_TRANSPOSE4_PS(Y,F,G,H);
2250         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2251         G        = _mm_mul_ps(G,eps);               /* Geps  */
2252         F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
2253         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2254         *vctot   = _mm_add_ps(*vctot,vcoul);
2255
2256         F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2257
2258         return _mm_mul_ps(F,tabscale);
2259 }
2260
2261
2262
2263 /* Return force should be multiplied by -rinv to get fscal */
2264 static inline __m128
2265 gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2266 {
2267     __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2268         __m128i n0;
2269         int     n_a,n_b,n_c;
2270
2271     rt       = _mm_mul_ps(r,tabscale);
2272         n0       = _mm_cvttps_epi32(rt);
2273         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2274         eps2     = _mm_mul_ps(eps,eps);
2275
2276         /* Extract indices from n0 */
2277         n_a      = gmx_mm_extract_epi32(n0,0);
2278         n_b      = gmx_mm_extract_epi32(n0,1);
2279         n_c      = gmx_mm_extract_epi32(n0,2);
2280
2281         /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2282          * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2283          * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2284          */
2285         Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset);
2286         Fd       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset);
2287         Gd       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + offset);
2288         Hd       = _mm_setzero_ps();
2289         Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset + 4);
2290         Fr       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset + 4);
2291         Gr       = _mm_load_ps(VFtab + 4*(offset+2)* n_c + offset + 4);
2292         Hr       = _mm_setzero_ps();
2293         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2294         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2295         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2296         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2297         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2298         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2299         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2300         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2301         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2302         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2303         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2304
2305         Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2306         Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2307
2308         return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2309 }
2310
2311
2312 /* Return force should be multiplied by -rinv to get fscal */
2313 static inline __m128
2314 gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2315                                                                   __m128 *vctot, __m128 *vvdwtot)
2316 {
2317     __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2318         __m128i n0;
2319         int     n_a,n_b,n_c;
2320
2321     rt       = _mm_mul_ps(r,tabscale);
2322         n0       = _mm_cvttps_epi32(rt);
2323         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2324         eps2     = _mm_mul_ps(eps,eps);
2325
2326         /* Extract indices from n0 */
2327         n_a      = gmx_mm_extract_epi32(n0,0);
2328         n_b      = gmx_mm_extract_epi32(n0,1);
2329         n_c      = gmx_mm_extract_epi32(n0,2);
2330
2331
2332         Yc       = _mm_load_ps(VFtab + 12* n_a);
2333         Fc       = _mm_load_ps(VFtab + 12* n_b);
2334         Gc       = _mm_load_ps(VFtab + 12* n_c);
2335         Hc       = _mm_setzero_ps();
2336         Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
2337         Fd       = _mm_load_ps(VFtab + 12* n_b + 4);
2338         Gd       = _mm_load_ps(VFtab + 12* n_c + 4);
2339         Hd       = _mm_setzero_ps();
2340         Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
2341         Fr       = _mm_load_ps(VFtab + 12* n_b + 8);
2342         Gr       = _mm_load_ps(VFtab + 12* n_c + 8);
2343         Hr       = _mm_setzero_ps();
2344         _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2345         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2346         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2347         Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
2348         Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
2349         Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
2350         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2351         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2352         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2353         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2354         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2355         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2356
2357         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2358         *vctot   = _mm_add_ps(*vctot,vcoul);
2359
2360         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2361         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2362         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2363
2364         Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2365         Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2366         Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2367
2368         return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2369 }
2370
2371
2372
2373
2374
2375 /* Return force should be multiplied by -rinv to get fscal */
2376 static inline __m128
2377 gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2378 {
2379     __m128  rt,eps,eps2,Y,F,G,H,vcoul;
2380         __m128i n0;
2381         int     n_a,n_b;
2382
2383     rt       = _mm_mul_ps(r,tabscale);
2384         n0       = _mm_cvttps_epi32(rt);
2385         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2386         eps2     = _mm_mul_ps(eps,eps);
2387
2388         /* Extract indices from n0 */
2389         n_a      = gmx_mm_extract_epi32(n0,0);
2390         n_b      = gmx_mm_extract_epi32(n0,1);
2391         Y        = _mm_load_ps(VFtab + 4* n_a);
2392         F        = _mm_load_ps(VFtab + 4* n_b);
2393         G        = _mm_setzero_ps();
2394         H        = _mm_setzero_ps();
2395         _MM_TRANSPOSE4_PS(Y,F,G,H);
2396         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2397         G        = _mm_mul_ps(G,eps);               /* Geps  */
2398         F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
2399         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2400         *vctot   = _mm_add_ps(*vctot,vcoul);
2401
2402         F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2403
2404         return _mm_mul_ps(F,tabscale);
2405 }
2406
2407
2408
2409 /* Return force should be multiplied by -rinv to get fscal */
2410 static inline __m128
2411 gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2412 {
2413     __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2414         __m128i n0;
2415         int     n_a,n_b;
2416
2417     rt       = _mm_mul_ps(r,tabscale);
2418         n0       = _mm_cvttps_epi32(rt);
2419         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2420         eps2     = _mm_mul_ps(eps,eps);
2421
2422         /* Extract indices from n0 */
2423         n_a      = gmx_mm_extract_epi32(n0,0);
2424         n_b      = gmx_mm_extract_epi32(n0,1);
2425
2426         /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2427          * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2428          * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2429          */
2430         Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset);
2431         Fd       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset);
2432         Gd       = _mm_setzero_ps();
2433         Hd       = _mm_setzero_ps();
2434         Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset + 4);
2435         Fr       = _mm_load_ps(VFtab + 4*(offset+2)* n_b + offset + 4);
2436         Gr       = _mm_setzero_ps();
2437         Hr       = _mm_setzero_ps();
2438         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2439         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2440         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2441         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2442         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2443         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2444         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2445         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2446         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2447         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2448         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2449
2450         Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2451         Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2452
2453         return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2454 }
2455
2456
2457 /* Return force should be multiplied by -rinv to get fscal */
2458 static inline __m128
2459 gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2460                                                                   __m128 *vctot, __m128 *vvdwtot)
2461 {
2462     __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2463         __m128i n0;
2464         int     n_a,n_b;
2465
2466     rt       = _mm_mul_ps(r,tabscale);
2467         n0       = _mm_cvttps_epi32(rt);
2468         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2469         eps2     = _mm_mul_ps(eps,eps);
2470
2471         /* Extract indices from n0 */
2472         n_a      = gmx_mm_extract_epi32(n0,0);
2473         n_b      = gmx_mm_extract_epi32(n0,1);
2474
2475         Yc       = _mm_load_ps(VFtab + 12* n_a);
2476         Fc       = _mm_load_ps(VFtab + 12* n_b);
2477         Gc       = _mm_setzero_ps();
2478         Hc       = _mm_setzero_ps();
2479         Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
2480         Fd       = _mm_load_ps(VFtab + 12* n_b + 4);
2481         Gd       = _mm_setzero_ps();
2482         Hd       = _mm_setzero_ps();
2483         Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
2484         Fr       = _mm_load_ps(VFtab + 12* n_b + 8);
2485         Gr       = _mm_setzero_ps();
2486         Hr       = _mm_setzero_ps();
2487         _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2488         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2489         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2490         Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
2491         Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
2492         Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
2493         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2494         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2495         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2496         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2497         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2498         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2499
2500         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2501         *vctot   = _mm_add_ps(*vctot,vcoul);
2502
2503         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2504         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2505         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2506
2507         Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2508         Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2509         Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2510
2511         return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2512 }
2513
2514
2515
2516
2517 /* Return force should be multiplied by -rinv to get fscal */
2518 static inline __m128
2519 gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
2520 {
2521     __m128  rt,eps,eps2,Y,F,G,H,vcoul;
2522         __m128i n0;
2523         int     n_a;
2524
2525     rt       = _mm_mul_ps(r,tabscale);
2526         n0       = _mm_cvttps_epi32(rt);
2527         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2528         eps2     = _mm_mul_ps(eps,eps);
2529
2530         /* Extract indices from n0 */
2531         n_a      = gmx_mm_extract_epi32(n0,0);
2532         Y        = _mm_load_ps(VFtab + 4* n_a);
2533         F        = _mm_setzero_ps();
2534         G        = _mm_setzero_ps();
2535         H        = _mm_setzero_ps();
2536         _MM_TRANSPOSE4_PS(Y,F,G,H);
2537         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2538         G        = _mm_mul_ps(G,eps);               /* Geps  */
2539         F        = _mm_add_ps(F, _mm_add_ps(G,H));  /* Fp    */
2540         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Y, _mm_mul_ps(eps,F)));
2541         *vctot   = _mm_add_ps(*vctot,vcoul);
2542
2543         F        = _mm_mul_ps(qq, _mm_add_ps(F, _mm_add_ps(G, _mm_add_ps(H,H))));
2544
2545         return _mm_mul_ps(F,tabscale);
2546 }
2547
2548
2549
2550 /* Return force should be multiplied by -rinv to get fscal */
2551 static inline __m128
2552 gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
2553 {
2554     __m128  rt,eps,eps2,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2555         __m128i n0;
2556         int     n_a;
2557
2558     rt       = _mm_mul_ps(r,tabscale);
2559         n0       = _mm_cvttps_epi32(rt);
2560         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2561         eps2     = _mm_mul_ps(eps,eps);
2562
2563         /* Extract indices from n0 */
2564         n_a      = gmx_mm_extract_epi32(n0,0);
2565
2566         /* For a few cases, like TIP4p waters, there are particles with LJ-only interactions in a loop where
2567          * the table data might contain both coulomb and LJ. To handle this case, we use an offset value of 0
2568          * if the data is an LJ-only table, and 1 if it is actually a mixed coul+lj table.
2569          */
2570         Yd       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset);
2571         Fd       = _mm_setzero_ps();
2572         Gd       = _mm_setzero_ps();
2573         Hd       = _mm_setzero_ps();
2574         Yr       = _mm_load_ps(VFtab + 4*(offset+2)* n_a + offset + 4);
2575         Fr       = _mm_setzero_ps();
2576         Gr       = _mm_setzero_ps();
2577         Hr       = _mm_setzero_ps();
2578         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2579         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2580         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2581         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2582         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2583         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2584         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2585         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2586         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2587         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2588         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2589
2590         Fd        = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2591         Fr        = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2592
2593         return _mm_mul_ps( _mm_add_ps(Fd,Fr),tabscale);
2594 }
2595
2596
2597 /* Return force should be multiplied by -rinv to get fscal */
2598 static inline __m128
2599 gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
2600                                                                          __m128 *vctot, __m128 *vvdwtot)
2601 {
2602     __m128  rt,eps,eps2,vcoul,Yc,Fc,Gc,Hc,Yd,Fd,Gd,Hd,Yr,Fr,Gr,Hr,vvdw6,vvdw12;
2603         __m128i n0;
2604         int     n_a;
2605
2606     rt       = _mm_mul_ps(r,tabscale);
2607         n0       = _mm_cvttps_epi32(rt);
2608         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2609         eps2     = _mm_mul_ps(eps,eps);
2610
2611         /* Extract indices from n0 */
2612         n_a      = gmx_mm_extract_epi32(n0,0);
2613
2614         Yc       = _mm_load_ps(VFtab + 12* n_a);
2615         Fc       = _mm_setzero_ps();
2616         Gc       = _mm_setzero_ps();
2617         Hc       = _mm_setzero_ps();
2618         Yd       = _mm_load_ps(VFtab + 12* n_a + 4);
2619         Fd       = _mm_setzero_ps();
2620         Gd       = _mm_setzero_ps();
2621         Hd       = _mm_setzero_ps();
2622         Yr       = _mm_load_ps(VFtab + 12* n_a + 8);
2623         Fr       = _mm_setzero_ps();
2624         Gr       = _mm_setzero_ps();
2625         Hr       = _mm_setzero_ps();
2626         _MM_TRANSPOSE4_PS(Yc,Fc,Gc,Hc);
2627         _MM_TRANSPOSE4_PS(Yd,Fd,Gd,Hd);
2628         _MM_TRANSPOSE4_PS(Yr,Fr,Gr,Hr);
2629         Hc       = _mm_mul_ps(Hc,eps2);              /* Heps2 */
2630         Gc       = _mm_mul_ps(Gc,eps);               /* Geps  */
2631         Fc       = _mm_add_ps(Fc, _mm_add_ps(Gc,Hc));  /* Fp    */
2632         Hd       = _mm_mul_ps(Hd,eps2);              /* Heps2 */
2633         Gd       = _mm_mul_ps(Gd,eps);               /* Geps  */
2634         Fd       = _mm_add_ps(Fd, _mm_add_ps(Gd,Hd));  /* Fp    */
2635         Hr       = _mm_mul_ps(Hr,eps2);              /* Heps2 */
2636         Gr       = _mm_mul_ps(Gr,eps);               /* Geps  */
2637         Fr       = _mm_add_ps(Fr, _mm_add_ps(Gr,Hr));  /* Fp    */
2638
2639         vcoul    = _mm_mul_ps(qq, _mm_add_ps(Yc, _mm_mul_ps(eps,Fc)));
2640         *vctot   = _mm_add_ps(*vctot,vcoul);
2641
2642         vvdw6    = _mm_mul_ps(c6,  _mm_add_ps(Yd, _mm_mul_ps(eps,Fd)));
2643         vvdw12   = _mm_mul_ps(c12, _mm_add_ps(Yr, _mm_mul_ps(eps,Fr)));
2644         *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(vvdw6,vvdw12));
2645
2646         Fc       = _mm_mul_ps(qq, _mm_add_ps(Fc, _mm_add_ps(Gc, _mm_add_ps(Hc,Hc))));
2647         Fd       = _mm_mul_ps(c6,  _mm_add_ps(Fd, _mm_add_ps(Gd, _mm_add_ps(Hd,Hd))));
2648         Fr       = _mm_mul_ps(c12, _mm_add_ps(Fr, _mm_add_ps(Gr, _mm_add_ps(Hr,Hr))));
2649
2650         return _mm_mul_ps( _mm_add_ps(Fc,_mm_add_ps(Fd,Fr)),tabscale);
2651 }
2652
2653
2654
2655
2656
2657 /* Return force should be multiplied by +rinv to get fscal */
2658 static inline __m128
2659 gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
2660                                                 float * isaj1, float *isaj2, float *isaj3, float *isaj4,
2661                                                 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2662                                                 float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
2663                                                 __m128 *vgbtot)
2664 {
2665         const __m128 half  = {0.5,0.5,0.5,0.5};
2666
2667     __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2668         __m128i n0;
2669         int     n_a,n_b,n_c,n_d;
2670
2671         /* Assemble isaj */
2672         isaj     = _mm_load_ss(isaj1);
2673         t2       = _mm_load_ss(isaj2);
2674         t3       = _mm_load_ss(isaj3);
2675         t4       = _mm_load_ss(isaj4);
2676         isaj     = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */
2677         t3       = _mm_unpacklo_ps(t3,t4);  /* - - t4 t3 */
2678         isaj     = _mm_movelh_ps(isaj,t3); /* t4 t3 t2 t1 */
2679
2680         isaprod     = _mm_mul_ps(isai,isaj);
2681         qq          = _mm_mul_ps(qq,isaprod);
2682         gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
2683
2684         rt       = _mm_mul_ps(r,gbtabscale);
2685         n0       = _mm_cvttps_epi32(rt);
2686         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2687         eps2     = _mm_mul_ps(eps,eps);
2688
2689         /* Extract indices from n0 */
2690         n_a      = gmx_mm_extract_epi32(n0,0);
2691         n_b      = gmx_mm_extract_epi32(n0,1);
2692         n_c      = gmx_mm_extract_epi32(n0,2);
2693         n_d      = gmx_mm_extract_epi32(n0,3);
2694         Y        = _mm_load_ps(GBtab + 4* n_a);
2695         F        = _mm_load_ps(GBtab + 4* n_b);
2696         G        = _mm_load_ps(GBtab + 4* n_c);
2697         H        = _mm_load_ps(GBtab + 4* n_d);
2698         _MM_TRANSPOSE4_PS(Y,F,G,H);
2699         G        = _mm_mul_ps(G,eps);               /* Geps  */
2700         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2701         F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
2702
2703         VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2704         FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2705
2706         vgb      = _mm_mul_ps(qq, VV);
2707         *vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2708
2709         ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2710
2711         dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2712
2713         *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2714
2715         dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2716
2717         /* Update 4 dada[j] values */
2718         Y        = _mm_load_ss(dvdaj1);
2719         F        = _mm_load_ss(dvdaj2);
2720         G        = _mm_load_ss(dvdaj3);
2721         H        = _mm_load_ss(dvdaj4);
2722         t3       = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
2723         t2       = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
2724         t4       = _mm_shuffle_ps(t3,t3,_MM_SHUFFLE(0,0,0,1));
2725
2726         _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2727         _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
2728         _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
2729         _mm_store_ss( dvdaj4 , _mm_add_ss( H, t4 ) );
2730
2731         return ftmp;
2732 }
2733
2734
2735
2736 /* Return force should be multiplied by +rinv to get fscal */
2737 static inline __m128
2738 gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
2739                                                 float * isaj1, float *isaj2, float *isaj3,
2740                                                 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2741                                                 float *dvdaj1, float *dvdaj2, float *dvdaj3,
2742                                                 __m128 *vgbtot)
2743 {
2744         const __m128 half  = {0.5,0.5,0.5,0.5};
2745
2746     __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2747         __m128i n0;
2748         int     n_a,n_b,n_c,n_d;
2749
2750         /* Assemble isaj */
2751         isaj     = _mm_load_ss(isaj1);
2752         t2       = _mm_load_ss(isaj2);
2753         t3       = _mm_load_ss(isaj3);
2754         isaj     = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */
2755         t3       = _mm_unpacklo_ps(t3,t3);  /* - - t3 t3 */
2756         isaj     = _mm_movelh_ps(isaj,t3); /* t3 t3 t2 t1 */
2757
2758         isaprod     = _mm_mul_ps(isai,isaj);
2759         qq          = _mm_mul_ps(qq,isaprod);
2760         gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
2761
2762         rt       = _mm_mul_ps(r,gbtabscale);
2763         n0       = _mm_cvttps_epi32(rt);
2764         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2765         eps2     = _mm_mul_ps(eps,eps);
2766
2767         /* Extract indices from n0 */
2768         n_a      = gmx_mm_extract_epi32(n0,0);
2769         n_b      = gmx_mm_extract_epi32(n0,1);
2770         n_c      = gmx_mm_extract_epi32(n0,2);
2771         Y        = _mm_load_ps(GBtab + 4* n_a);
2772         F        = _mm_load_ps(GBtab + 4* n_b);
2773         G        = _mm_load_ps(GBtab + 4* n_c);
2774         H        = _mm_setzero_ps();
2775         _MM_TRANSPOSE4_PS(Y,F,G,H);
2776         G        = _mm_mul_ps(G,eps);               /* Geps  */
2777         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2778         F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
2779
2780         VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2781         FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2782
2783         vgb      = _mm_mul_ps(qq, VV);
2784         *vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2785
2786         ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2787
2788         dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2789
2790         *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2791
2792         dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2793
2794         /* Update 3 dada[j] values */
2795         Y        = _mm_load_ss(dvdaj1);
2796         F        = _mm_load_ss(dvdaj2);
2797         G        = _mm_load_ss(dvdaj3);
2798         t3       = _mm_movehl_ps(_mm_setzero_ps(),dvdatmp);
2799         t2       = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
2800
2801         _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2802         _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
2803         _mm_store_ss( dvdaj3 , _mm_add_ss( G, t3 ) );
2804
2805         return ftmp;
2806 }
2807
2808
2809
2810
2811 /* Return force should be multiplied by +rinv to get fscal */
2812 static inline __m128
2813 gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
2814                                                 float * isaj1, float *isaj2,
2815                                                 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2816                                                 float *dvdaj1, float *dvdaj2,
2817                                                 __m128 *vgbtot)
2818 {
2819         const __m128 half  = {0.5,0.5,0.5,0.5};
2820
2821     __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2822         __m128i n0;
2823         int     n_a,n_b,n_c,n_d;
2824
2825         /* Assemble isaj */
2826         isaj     = _mm_load_ss(isaj1);
2827         t2       = _mm_load_ss(isaj2);
2828         isaj     = _mm_unpacklo_ps(isaj,t2);  /* - - t2 t1 */
2829
2830         isaprod     = _mm_mul_ps(isai,isaj);
2831         qq          = _mm_mul_ps(qq,isaprod);
2832         gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
2833
2834         rt       = _mm_mul_ps(r,gbtabscale);
2835         n0       = _mm_cvttps_epi32(rt);
2836         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2837         eps2     = _mm_mul_ps(eps,eps);
2838
2839         /* Extract indices from n0 */
2840         n_a      = gmx_mm_extract_epi32(n0,0);
2841         n_b      = gmx_mm_extract_epi32(n0,1);
2842         Y        = _mm_load_ps(GBtab + 4* n_a);
2843         F        = _mm_load_ps(GBtab + 4* n_b);
2844         G        = _mm_setzero_ps();
2845         H        = _mm_setzero_ps();
2846         _MM_TRANSPOSE4_PS(Y,F,G,H);
2847         G        = _mm_mul_ps(G,eps);               /* Geps  */
2848         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2849         F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
2850
2851         VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2852         FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2853
2854         vgb      = _mm_mul_ps(qq, VV);
2855         *vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2856
2857         ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2858
2859         dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2860
2861         *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2862
2863         dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2864
2865         /* Update 2 dada[j] values */
2866         Y        = _mm_load_ss(dvdaj1);
2867         F        = _mm_load_ss(dvdaj2);
2868         t2       = _mm_shuffle_ps(dvdatmp,dvdatmp,_MM_SHUFFLE(0,0,0,1));
2869
2870         _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2871         _mm_store_ss( dvdaj2 , _mm_add_ss( F, t2 ) );
2872
2873         return ftmp;
2874 }
2875
2876 /* Return force should be multiplied by +rinv to get fscal */
2877 static inline __m128
2878 gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
2879                                                 float * isaj1,
2880                                                 __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
2881                                                 float *dvdaj1,
2882                                                 __m128 *vgbtot)
2883 {
2884         const __m128 half  = {0.5,0.5,0.5,0.5};
2885
2886     __m128  rt,eps,eps2,Y,F,G,H,VV,FF,ftmp,isaprod,t2,t3,t4,isaj,vgb,dvdatmp;
2887         __m128i n0;
2888         int     n_a,n_b,n_c,n_d;
2889
2890         /* Assemble isaj */
2891         isaj     = _mm_load_ss(isaj1);
2892
2893         isaprod     = _mm_mul_ps(isai,isaj);
2894         qq          = _mm_mul_ps(qq,isaprod);
2895         gbtabscale  = _mm_mul_ps( isaprod, gbtabscale );
2896
2897         rt       = _mm_mul_ps(r,gbtabscale);
2898         n0       = _mm_cvttps_epi32(rt);
2899         eps      = _mm_sub_ps(rt, _mm_cvtepi32_ps(n0));
2900         eps2     = _mm_mul_ps(eps,eps);
2901
2902         /* Extract indices from n0 */
2903         n_a      = gmx_mm_extract_epi32(n0,0);
2904         Y        = _mm_load_ps(GBtab + 4* n_a);
2905         F        = _mm_setzero_ps();
2906         G        = _mm_setzero_ps();
2907         H        = _mm_setzero_ps();
2908         _MM_TRANSPOSE4_PS(Y,F,G,H);
2909         G        = _mm_mul_ps(G,eps);               /* Geps  */
2910         H        = _mm_mul_ps(H,eps2);              /* Heps2 */
2911         F        = _mm_add_ps(_mm_add_ps(F,G),H);  /* Fp    */
2912
2913         VV       = _mm_add_ps(Y, _mm_mul_ps(eps,F));
2914         FF       = _mm_add_ps(_mm_add_ps(F,G), _mm_add_ps(H,H));
2915
2916         vgb      = _mm_mul_ps(qq, VV);
2917         *vgbtot  = _mm_sub_ps(*vgbtot,vgb); /* Yes, the sign is correct */
2918
2919         ftmp     = _mm_mul_ps(_mm_mul_ps(qq, FF), gbtabscale);
2920
2921         dvdatmp  = _mm_mul_ps(half, _mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
2922
2923         *dvdasum = _mm_add_ps(*dvdasum,dvdatmp);
2924
2925         dvdatmp  = _mm_mul_ps(_mm_mul_ps(dvdatmp,isaj), isaj);
2926
2927         /* Update 1 dada[j] values */
2928         Y        = _mm_load_ss(dvdaj1);
2929
2930         _mm_store_ss( dvdaj1 , _mm_add_ss( Y, dvdatmp ) );
2931
2932         return ftmp;
2933 }
2934
2935
2936
2937
2938
2939 static inline void
2940 gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
2941                               float *fptr,
2942                               float *fshiftptr)
2943 {
2944         __m128 t1,t2,t3;
2945
2946 #ifdef GMX_SSE3
2947         fix1 = _mm_hadd_ps(fix1,fix1);
2948         fiy1 = _mm_hadd_ps(fiy1,fiz1);
2949
2950         fix1 = _mm_hadd_ps(fix1,fiy1); /* fiz1 fiy1 fix1 fix1 */
2951 #else
2952         /* SSE2 */
2953         /* transpose data */
2954         t1 = fix1;
2955         _MM_TRANSPOSE4_PS(fix1,t1,fiy1,fiz1);
2956         fix1 = _mm_add_ps(_mm_add_ps(fix1,t1), _mm_add_ps(fiy1,fiz1));
2957 #endif
2958         t2 = _mm_load_ss(fptr);
2959         t2 = _mm_loadh_pi(t2,(__m64 *)(fptr+1));
2960         t3 = _mm_load_ss(fshiftptr);
2961         t3 = _mm_loadh_pi(t3,(__m64 *)(fshiftptr+1));
2962
2963         t2 = _mm_add_ps(t2,fix1);
2964         t3 = _mm_add_ps(t3,fix1);
2965
2966         _mm_store_ss(fptr,t2);
2967         _mm_storeh_pi((__m64 *)(fptr+1),t2);
2968         _mm_store_ss(fshiftptr,t3);
2969         _mm_storeh_pi((__m64 *)(fshiftptr+1),t3);
2970 }
2971
2972 static inline void
2973 gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
2974                                __m128 fix2, __m128 fiy2, __m128 fiz2,
2975                                float *fptr,
2976                                float *fshiftptr)
2977 {
2978         __m128 t1,t2,t4;
2979
2980 #ifdef GMX_SSE3
2981         fix1 = _mm_hadd_ps(fix1,fiy1);
2982         fiz1 = _mm_hadd_ps(fiz1,fix2);
2983         fiy2 = _mm_hadd_ps(fiy2,fiz2);
2984
2985         fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
2986         fiy2 = _mm_hadd_ps(fiy2,fiy2); /*  -    -   fiz2 fiy2 */
2987 #else
2988         /* SSE2 */
2989         /* transpose data */
2990         _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
2991         t1 = _mm_unpacklo_ps(fiy2,fiz2);
2992         t2 = _mm_unpackhi_ps(fiy2,fiz2);
2993
2994         fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
2995         t1   = _mm_add_ps(t1,t2);
2996         t2   = _mm_movehl_ps(t2,t1);
2997         fiy2 = _mm_add_ps(t1,t2);
2998 #endif
2999         _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
3000         t1 = _mm_loadl_pi(t1,(__m64 *)(fptr+4));
3001         _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));
3002
3003         t4 = _mm_load_ss(fshiftptr+2);
3004         t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
3005
3006         t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
3007         t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
3008         t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */
3009
3010         t1 = _mm_add_ps(t1,t2);
3011         t1 = _mm_add_ps(t1,t4); /* y x - z */
3012
3013         _mm_store_ss(fshiftptr+2,t1);
3014         _mm_storeh_pi((__m64 *)(fshiftptr),t1);
3015 }
3016
3017
3018
3019 static inline void
3020 gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
3021                                __m128 fix2, __m128 fiy2, __m128 fiz2,
3022                                __m128 fix3, __m128 fiy3, __m128 fiz3,
3023                                float *fptr,
3024                                float *fshiftptr)
3025 {
3026         __m128 t1,t2,t3,t4;
3027
3028 #ifdef GMX_SSE3
3029         fix1 = _mm_hadd_ps(fix1,fiy1);
3030         fiz1 = _mm_hadd_ps(fiz1,fix2);
3031         fiy2 = _mm_hadd_ps(fiy2,fiz2);
3032         fix3 = _mm_hadd_ps(fix3,fiy3);
3033         fiz3 = _mm_hadd_ps(fiz3,fiz3);
3034
3035         fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
3036         fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
3037         fiz3 = _mm_hadd_ps(fiz3,fiz3); /*  -    -    -   fiz3 */
3038 #else
3039         /* SSE2 */
3040         /* transpose data */
3041         _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
3042         _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
3043         t2   = _mm_movehl_ps(_mm_setzero_ps(),fiz3);
3044         t1   = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));
3045         t3   = _mm_shuffle_ps(t2,t2,_MM_SHUFFLE(0,0,0,1));
3046
3047         fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
3048         fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
3049         fiz3 = _mm_add_ss(_mm_add_ps(fiz3,t1)  , _mm_add_ps(t2,t3));
3050 #endif
3051         _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
3052         _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
3053         _mm_store_ss (fptr+8,_mm_add_ss(fiz3,_mm_load_ss(fptr+8) ));
3054
3055         t4 = _mm_load_ss(fshiftptr+2);
3056         t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));
3057
3058         t1 = _mm_shuffle_ps(fiz3,fix1,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
3059         t2 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
3060         t3 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
3061         t3 = _mm_shuffle_ps(t3  ,t3  ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */
3062
3063         t1 = _mm_add_ps(t1,t2);
3064         t3 = _mm_add_ps(t3,t4);
3065         t1 = _mm_add_ps(t1,t3); /* y x - z */
3066
3067         _mm_store_ss(fshiftptr+2,t1);
3068         _mm_storeh_pi((__m64 *)(fshiftptr),t1);
3069 }
3070
3071
3072 static inline void
3073 gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
3074                                __m128 fix2, __m128 fiy2, __m128 fiz2,
3075                                __m128 fix3, __m128 fiy3, __m128 fiz3,
3076                                __m128 fix4, __m128 fiy4, __m128 fiz4,
3077                                float *fptr,
3078                                float *fshiftptr)
3079 {
3080         __m128 t1,t2,t3,t4,t5;
3081
3082 #ifdef GMX_SSE3
3083         fix1 = _mm_hadd_ps(fix1,fiy1);
3084         fiz1 = _mm_hadd_ps(fiz1,fix2);
3085         fiy2 = _mm_hadd_ps(fiy2,fiz2);
3086         fix3 = _mm_hadd_ps(fix3,fiy3);
3087         fiz3 = _mm_hadd_ps(fiz3,fix4);
3088         fiy4 = _mm_hadd_ps(fiy4,fiz4);
3089
3090         fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
3091         fiy2 = _mm_hadd_ps(fiy2,fix3); /* fiy3 fix3 fiz2 fiy2 */
3092         fiz3 = _mm_hadd_ps(fiz3,fiy4); /* fiz4 fiy4 fix4 fiz3 */
3093 #else
3094         /* SSE2 */
3095         /* transpose data */
3096         _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
3097         _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
3098         _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);
3099
3100         fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
3101         fiy2 = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
3102         fiz3 = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
3103 #endif
3104         _mm_storeu_ps(fptr,  _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
3105         _mm_storeu_ps(fptr+4,_mm_add_ps(fiy2,_mm_loadu_ps(fptr+4)));
3106         _mm_storeu_ps(fptr+8,_mm_add_ps(fiz3,_mm_loadu_ps(fptr+8)));
3107
3108         t5 = _mm_load_ss(fshiftptr+2);
3109         t5 = _mm_loadh_pi(t5,(__m64 *)(fshiftptr));
3110
3111         t1 = _mm_shuffle_ps(fix1,fix1,_MM_SHUFFLE(1,0,2,2));   /* fiy1 fix1  -   fiz1 */
3112         t2 = _mm_shuffle_ps(fiy2,fiy2,_MM_SHUFFLE(3,2,1,1));   /* fiy3 fix3  -   fiz2 */
3113         t3 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(2,1,0,0));   /* fiy4 fix4  -   fiz3 */
3114         t4 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,3));   /* fiy2 fiy2 fix2 fix2 */
3115         t4 = _mm_shuffle_ps(fiz3,t4  ,_MM_SHUFFLE(2,0,3,3));   /* fiy2 fix2  -   fiz4 */
3116
3117         t1 = _mm_add_ps(t1,t2);
3118         t3 = _mm_add_ps(t3,t4);
3119         t1 = _mm_add_ps(t1,t3); /* y x - z */
3120         t5 = _mm_add_ps(t5,t1);
3121
3122         _mm_store_ss(fshiftptr+2,t5);
3123         _mm_storeh_pi((__m64 *)(fshiftptr),t5);
3124 }
3125
3126
3127 static inline void
3128 gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
3129 {
3130 #ifdef GMX_SSE3
3131         pot1 = _mm_hadd_ps(pot1,pot1);
3132         pot1 = _mm_hadd_ps(pot1,pot1);
3133 #else
3134         /* SSE2 */
3135         pot1 = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1));
3136         pot1 = _mm_add_ps(pot1,_mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1)));
3137 #endif
3138         _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
3139 }
3140
3141
3142 static inline void
3143 gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
3144 {
3145 #ifdef GMX_SSE3
3146         pot1 = _mm_hadd_ps(pot1,pot2);
3147         pot1 = _mm_hadd_ps(pot1,pot1);
3148         pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(0,0,0,1));
3149 #else
3150         /* SSE2 */
3151         __m128 t1,t2;
3152         t1   = _mm_movehl_ps(pot2,pot1); /* 2d 2c 1d 1c */
3153         t2   = _mm_movelh_ps(pot1,pot2); /* 2b 2a 1b 1a */
3154         t1   = _mm_add_ps(t1,t2);       /* 2  2  1  1  */
3155         t2   = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,3,1,1));
3156         pot1 = _mm_add_ps(t1,t2);       /* -  2  -  1  */
3157         pot2 = _mm_movehl_ps(t2,pot1);    /* -  -  -  2  */
3158 #endif
3159
3160         _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
3161         _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
3162 }
3163
3164
3165 static inline void
3166 gmx_mm_update_4pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2, __m128 pot3, float *ptr3, __m128 pot4, float *ptr4)
3167 {
3168     _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
3169
3170     pot1 = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));
3171     pot2 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(1,1,1,1));
3172     pot3 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(2,2,2,2));
3173     pot4 = _mm_shuffle_ps(pot1,pot1,_MM_SHUFFLE(3,3,3,3));
3174
3175         _mm_store_ss(ptr1,_mm_add_ss(pot1,_mm_load_ss(ptr1)));
3176         _mm_store_ss(ptr2,_mm_add_ss(pot2,_mm_load_ss(ptr2)));
3177         _mm_store_ss(ptr3,_mm_add_ss(pot3,_mm_load_ss(ptr3)));
3178         _mm_store_ss(ptr4,_mm_add_ss(pot4,_mm_load_ss(ptr4)));
3179 }
3180