memcpy: hide some memory latencies
[nova-simd.git] / simd_utils.hpp
blobc576ffe7fec91fcc8e4f56456cbbef45c620d69e
1 // utilities for the simd implementation
2 // Copyright (C) 2008, 2009 Tim Blechmann
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
19 #ifndef SIMD_UTILS_HPP
20 #define SIMD_UTILS_HPP
#include <cstring>

#include <xmmintrin.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif /* __SSE2__ */

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif /* __SSE4_1__ */
33 namespace nova
35 namespace detail
38 #ifdef __SSE2__
/* 0x80000000 in every lane (the float sign bit), built entirely in
 * registers: pcmpeq yields all-ones, then >>31 <<31 leaves only bit 31.
 *
 * _mm_castsi128_ps is a zero-cost reinterpretation; unlike the former
 * C-style (__m128) cast it is portable (MSVC rejects direct casts
 * between vector types). */
inline __m128 gen_sign_mask(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(ones, 31), 31));
}
/* 0x7fffffff in every lane (all bits but the sign bit), built in
 * registers: all-ones <<1 >>1 clears only bit 31.
 *
 * _mm_castsi128_ps replaces the non-portable C-style (__m128) cast
 * (MSVC rejects direct casts between vector types). */
inline __m128 gen_abs_mask(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ones, 1), 1));
}
/* 1.0f in every lane, built in registers: all-ones >>25 gives 0x7f,
 * <<23 places it in the exponent field -> 0x3f800000 == 1.0f.
 *
 * _mm_castsi128_ps replaces the non-portable C-style (__m128) cast
 * (MSVC rejects direct casts between vector types). */
inline __m128 gen_one(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(ones, 25), 23));
}
/* 0.5f in every lane, built in registers: all-ones >>26 gives 0x3f,
 * <<24 -> 0x3f000000 == 0.5f.
 *
 * _mm_castsi128_ps replaces the non-portable C-style (__m128) cast
 * (MSVC rejects direct casts between vector types). */
inline __m128 gen_05(void)
{
    __m128i zero = _mm_setzero_si128();
    __m128i ones = _mm_cmpeq_epi32(zero, zero); /* all bits set */
    return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(ones, 26), 24));
}
67 #else
69 /* SSE fallback */
/* SSE1 fallback: 0x80000000 (the float sign bit) in every lane.
 * std::memcpy copies the bit pattern into a float; the former
 * (float*)(&int) dereference violated strict aliasing and is UB
 * under optimizing compilers. */
inline __m128 gen_sign_mask(void)
{
    const unsigned int sign_mask = 0x80000000u;
    float f;
    std::memcpy(&f, &sign_mask, sizeof f);
    return _mm_set_ps1(f);
}
/* SSE1 fallback: 0x7fffffff (all bits but the sign) in every lane.
 * std::memcpy copies the bit pattern into a float; the former
 * (float*)(&int) dereference violated strict aliasing and is UB
 * under optimizing compilers. */
inline __m128 gen_abs_mask(void)
{
    const unsigned int abs_mask = 0x7fffffffu;
    float f;
    std::memcpy(&f, &abs_mask, sizeof f);
    return _mm_set_ps1(f);
}
/* SSE1 fallback: 1.0f broadcast into all four lanes. */
inline __m128 gen_one(void)
{
    return _mm_set1_ps(1.f);
}
/* SSE1 fallback: 0.5f broadcast into all four lanes. */
inline __m128 gen_05(void)
{
    return _mm_set1_ps(0.5f);
}
95 #endif
/* 0.25f broadcast into all four lanes (no integer-path variant exists
 * for this constant). */
inline __m128 gen_025(void)
{
    return _mm_set1_ps(0.25f);
}
/* Return lane 0 of the vector as a scalar float. */
inline float extract_0(__m128 arg)
{
    float scalar;
    _mm_store_ss(&scalar, arg);
    return scalar;
}
/* Return lane 3 of the vector as a scalar float. */
inline float extract_3(__m128 arg)
{
    /* broadcast lane 3 into every position, then read lane 0 */
    const __m128 bcast = _mm_shuffle_ps(arg, arg, _MM_SHUFFLE(3, 3, 3, 3));
    float scalar;
    _mm_store_ss(&scalar, bcast);
    return scalar;
}
/* Horizontal minimum across all four lanes.
 *
 * BUGFIX: the previous reduction broadcast lane 2 via
 * _MM_SHUFFLE(2,2,2,2), so lane 3 never entered the comparison and the
 * result was min(e0, e1, e2) only.  Folding the high pair down with
 * movhlps includes every lane. */
inline float horizontal_min(__m128 args)
{
    __m128 accum = args;
    /* fold {e2, e3} onto {e0, e1} */
    __m128 folded = _mm_movehl_ps(accum, accum);
    accum = _mm_min_ps(accum, folded);  /* lane0 = min(e0,e2), lane1 = min(e1,e3) */
    /* fold lane 1 onto lane 0 */
    folded = _mm_shuffle_ps(accum, accum, _MM_SHUFFLE(1, 1, 1, 1));
    accum = _mm_min_ss(accum, folded);
    float result;
    _mm_store_ss(&result, accum);
    return result;
}
/* Horizontal maximum across all four lanes.
 *
 * BUGFIX: the previous reduction broadcast lane 2 via
 * _MM_SHUFFLE(2,2,2,2), so lane 3 never entered the comparison and the
 * result was max(e0, e1, e2) only.  Folding the high pair down with
 * movhlps includes every lane. */
inline float horizontal_max(__m128 args)
{
    __m128 accum = args;
    /* fold {e2, e3} onto {e0, e1} */
    __m128 folded = _mm_movehl_ps(accum, accum);
    accum = _mm_max_ps(accum, folded);  /* lane0 = max(e0,e2), lane1 = max(e1,e3) */
    /* fold lane 1 onto lane 0 */
    folded = _mm_shuffle_ps(accum, accum, _MM_SHUFFLE(1, 1, 1, 1));
    accum = _mm_max_ss(accum, folded);
    float result;
    _mm_store_ss(&result, accum);
    return result;
}
139 #ifdef __SSE4_1__
/* Per-element select via SSE4.1 blendv.
 *
 * NOTE(review): _mm_blendv_ps keys off only the SIGN bit (bit 31) of
 * each element of sel, while the pre-SSE4.1 fallback below uses every
 * bit of the mask.  The two agree only for the usual all-ones /
 * all-zeros masks produced by the compare intrinsics — do not pass
 * partial bitmasks. */
inline __m128 select_vector(__m128 val0, __m128 val1, __m128 sel)
{
    /* if bitmask is set, return value in val1, else value in val0 */
    return _mm_blendv_ps(val0, val1, sel);
}
147 #else
/* Per-element bitwise select (pre-SSE4.1 path): bits set in sel take
 * the corresponding bits of val1, clear bits take val0. */
inline __m128 select_vector(__m128 val0, __m128 val1, __m128 sel)
{
    /* if bitmask is set, return value in val1, else value in val0 */
    const __m128 from_val1 = _mm_and_ps(sel, val1);
    const __m128 from_val0 = _mm_andnot_ps(sel, val0);
    return _mm_or_ps(from_val1, from_val0);
}
156 #endif
158 } /* namespace detail */
159 } /* namespace nova */
161 #endif /* SIMD_UTILS_HPP */