memcpy: hide some memory latencies
[nova-simd.git] / simd_pan.hpp
blobe79a387e15f8cacfc4f62a7324d9d36a6c25c90c
1 // simd functions for panning
2 // Copyright (C) 2009, 2010 Tim Blechmann
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
14 // You should have received a copy of the GNU General Public License
15 // along with this program; see the file COPYING. If not, write to
16 // the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 // Boston, MA 02111-1307, USA.
20 #ifndef SIMD_PAN_HPP
21 #define SIMD_PAN_HPP
23 #include "vec.hpp"
// always_inline: when the compiler defines __GNUC__ and NDEBUG is set, expands
// to `inline` plus the always_inline attribute; in every other configuration
// it is a plain `inline`. The macro is #undef'd again near the end of this header.
25 #if defined(__GNUC__) && defined(NDEBUG)
26 #define always_inline inline __attribute__((always_inline))
27 #else
28 #define always_inline inline
29 #endif
31 namespace nova
/**
 * Two-channel pan with constant gains (scalar implementation).
 *
 * For every one of the n input samples, writes in[i] * factor0 to out0[i]
 * and in[i] * factor1 to out1[i].
 *
 * @param out0    destination of the first output channel
 * @param out1    destination of the second output channel
 * @param in      source samples
 * @param factor0 gain applied to the samples written to out0
 * @param factor1 gain applied to the samples written to out1
 * @param n       number of samples to process; 0 is a no-op
 */
template <typename F>
inline void pan2_vec(F * out0, F * out1, const F * in, F factor0, F factor1, unsigned int n)
{
    // Pre-tested loop so that n == 0 touches no memory; a do/while(--n) would
    // run the body once and then wrap the unsigned counter.
    for (; n != 0; --n) {
        const F sig = *in++;
        *out0++ = sig * factor0;
        *out1++ = sig * factor1;
    }
}
/**
 * Two-channel pan with linearly ramped gains (scalar implementation).
 *
 * For every one of the n input samples, writes in[i] * factor0 to out0[i]
 * and in[i] * factor1 to out1[i], then advances factor0 by slope0 and
 * factor1 by slope1 before the next sample.
 *
 * @param out0    destination of the first output channel
 * @param out1    destination of the second output channel
 * @param in      source samples
 * @param factor0 gain used for the first sample written to out0
 * @param slope0  per-sample increment of factor0
 * @param factor1 gain used for the first sample written to out1
 * @param slope1  per-sample increment of factor1
 * @param n       number of samples to process; 0 is a no-op
 */
template <typename F>
inline void pan2_vec(F * out0, F * out1, const F * in, F factor0, F slope0, F factor1, F slope1, unsigned int n)
{
    // Pre-tested loop so that n == 0 touches no memory; a do/while(--n) would
    // run the body once and then wrap the unsigned counter.
    for (; n != 0; --n) {
        const F sig = *in++;
        *out0++ = sig * factor0;
        *out1++ = sig * factor1;
        factor0 += slope0;
        factor1 += slope1;
    }
}
58 namespace detail
61 template <typename F, unsigned int n>
62 struct pan2
64 static const int offset = vec<F>::size;
66 static always_inline void mp_iteration(F * out0, F * out1, const F * in, vec<F> const & factor0, vec<F> const & factor1)
68 vec<F> vin, vout0, vout1;
69 vin.load_aligned(in);
71 vout0 = vin * factor0;
72 vout1 = vin * factor1;
74 vout0.store_aligned(out0);
75 vout1.store_aligned(out1);
77 pan2<F, n-offset>::mp_iteration(out0+offset, out1+offset, in+offset, factor0, factor1);
80 static always_inline void mp_iteration(F * out0, F * out1, const F * in, vec<F> & factor0, vec<F> const & slope0,
81 vec<F> & factor1, vec<F> const & slope1)
83 vec<F> vin, vout0, vout1;
84 vin.load_aligned(in);
86 vout0 = vin * factor0;
87 vout1 = vin * factor1;
89 vout0.store_aligned(out0);
90 vout1.store_aligned(out1);
91 factor0 += slope0;
92 factor1 += slope1;
94 pan2<F, n-offset>::mp_iteration(out0+offset, out1+offset, in+offset, factor0, slope0, factor1, slope1);
99 template <typename F>
100 struct pan2<F, 0>
102 static always_inline void mp_iteration(F * out0, F * out1, const F * in, vec<F> const & factor0, vec<F> const & factor1)
105 static always_inline void mp_iteration(F * out0, F * out1, const F * in, vec<F> & factor0, vec<F> const & slope0,
106 vec<F> & factor1, vec<F> const & slope1)
110 } /* namespace detail */
112 template <typename F>
113 inline void pan2_vec_simd(F * out0, F * out1, const F * in, F factor0, F factor1, unsigned int n)
115 vec<F> vf0(factor0), vf1(factor1);
116 const int per_loop = vec<F>::objects_per_cacheline;
118 n /= per_loop;
119 do {
120 detail::pan2<F, per_loop>::mp_iteration(out0, out1, in, vf0, vf1);
121 out0 += per_loop; out1 += per_loop; in += per_loop;
122 } while(--n);
125 template <unsigned int n, typename F>
126 inline void pan2_vec_simd(F * out0, F * out1, const F * in, F factor0, F factor1)
128 vec<F> vf0(factor0), vf1(factor1);
130 detail::pan2<F, n>::mp_iteration(out0, out1, in, vf0, vf1);
133 template <typename F>
134 inline void pan2_vec_simd(F * out0, F * out1, const F * in, F factor0, F slope0, F factor1, F slope1, unsigned int n)
136 const int per_loop = vec<F>::objects_per_cacheline;
138 vec<F> vf0, vf1, vslope0, vslope1;
139 vslope0.set_vec(vf0.set_slope(factor0, slope0));
140 vslope1.set_vec(vf1.set_slope(factor1, slope1));
142 n /= per_loop;
143 do {
144 detail::pan2<F, per_loop>::mp_iteration(out0, out1, in, vf0, vslope0, vf1, vslope1);
145 out0 += per_loop; out1 += per_loop; in += per_loop;
146 } while(--n);
149 template <unsigned int n, typename F>
150 inline void pan2_vec_simd(F * out0, F * out1, const F * in, F factor0, F slope0, F factor1, F slope1)
152 vec<F> vf0, vf1, vslope0, vslope1;
153 vslope0.set_vec(vf0.set_slope(factor0, slope0));
154 vslope1.set_vec(vf1.set_slope(factor1, slope1));
156 detail::pan2<F, n>::mp_iteration(out0, out1, in, vf0, vslope0, vf1, vslope1);
159 } /* namespace nova */
161 #undef always_inline
163 #endif /* SIMD_PAN_HPP */