1 #ifndef PHASE_SHIFTER_H
2 #define PHASE_SHIFTER_H
4 #ifdef HAVE_SSE_INTRINSICS
6 #elif defined(HAVE_NEON)
14 #include "alnumbers.h"
18 /* Implements a wide-band +90 degree phase-shift. Note that this should be
19 * given one sample less of a delay (FilterSize/2 - 1) compared to the direct
20 * signal delay (FilterSize/2) to properly align.
22 template<std::size_t FilterSize
>
23 struct PhaseShifterT
{
24 static_assert(FilterSize
>= 16, "FilterSize needs to be at least 16");
25 static_assert((FilterSize
&(FilterSize
-1)) == 0, "FilterSize needs to be power-of-two");
27 alignas(16) std::array
<float,FilterSize
/2> mCoeffs
{};
31 /* Every other coefficient is 0, so we only need to calculate and store
32 * the non-0 terms and double-step over the input to apply it. The
33 * calculated coefficients are in reverse to make applying in the time-
34 * domain more efficient.
36 for(std::size_t i
{0};i
< FilterSize
/2;++i
)
38 const int k
{static_cast<int>(i
*2 + 1) - int{FilterSize
/2}};
40 /* Calculate the Blackman window value for this coefficient. */
41 const double w
{2.0*al::numbers::pi
* static_cast<double>(i
*2 + 1)
42 / double{FilterSize
}};
43 const double window
{0.3635819 - 0.4891775*std::cos(w
) + 0.1365995*std::cos(2.0*w
)
44 - 0.0106411*std::cos(3.0*w
)};
46 const double pk
{al::numbers::pi
* static_cast<double>(k
)};
47 mCoeffs
[i
] = static_cast<float>(window
* (1.0-std::cos(pk
)) / pk
);
51 void process(const al::span
<float> dst
, const al::span
<const float> src
) const;
54 #if defined(HAVE_NEON)
55 static auto unpacklo(float32x4_t a
, float32x4_t b
)
57 float32x2x2_t result
{vzip_f32(vget_low_f32(a
), vget_low_f32(b
))};
58 return vcombine_f32(result
.val
[0], result
.val
[1]);
60 static auto unpackhi(float32x4_t a
, float32x4_t b
)
62 float32x2x2_t result
{vzip_f32(vget_high_f32(a
), vget_high_f32(b
))};
63 return vcombine_f32(result
.val
[0], result
.val
[1]);
65 static auto load4(float32_t a
, float32_t b
, float32_t c
, float32_t d
)
67 float32x4_t ret
{vmovq_n_f32(a
)};
68 ret
= vsetq_lane_f32(b
, ret
, 1);
69 ret
= vsetq_lane_f32(c
, ret
, 2);
70 ret
= vsetq_lane_f32(d
, ret
, 3);
73 static void vtranspose4(float32x4_t
&x0
, float32x4_t
&x1
, float32x4_t
&x2
, float32x4_t
&x3
)
75 float32x4x2_t t0_
{vzipq_f32(x0
, x2
)};
76 float32x4x2_t t1_
{vzipq_f32(x1
, x3
)};
77 float32x4x2_t u0_
{vzipq_f32(t0_
.val
[0], t1_
.val
[0])};
78 float32x4x2_t u1_
{vzipq_f32(t0_
.val
[1], t1_
.val
[1])};
87 template<std::size_t S
>
89 void PhaseShifterT
<S
>::process(const al::span
<float> dst
, const al::span
<const float> src
) const
91 auto in
= src
.begin();
92 #ifdef HAVE_SSE_INTRINSICS
93 if(const std::size_t todo
{dst
.size()>>2})
95 auto out
= al::span
{reinterpret_cast<__m128
*>(dst
.data()), todo
};
96 std::generate(out
.begin(), out
.end(), [&in
,this]
98 __m128 r0
{_mm_setzero_ps()};
99 __m128 r1
{_mm_setzero_ps()};
100 __m128 r2
{_mm_setzero_ps()};
101 __m128 r3
{_mm_setzero_ps()};
102 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
104 const __m128 coeffs
{_mm_load_ps(&mCoeffs
[j
])};
105 const __m128 s0
{_mm_loadu_ps(&in
[j
*2])};
106 const __m128 s1
{_mm_loadu_ps(&in
[j
*2 + 4])};
107 const __m128 s2
{_mm_movehl_ps(_mm_movelh_ps(s1
, s1
), s0
)};
108 const __m128 s3
{_mm_loadh_pi(_mm_movehl_ps(s1
, s1
),
109 reinterpret_cast<const __m64
*>(&in
[j
*2 + 8]))};
111 __m128 s
{_mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(2, 0, 2, 0))};
112 r0
= _mm_add_ps(r0
, _mm_mul_ps(s
, coeffs
));
114 s
= _mm_shuffle_ps(s0
, s1
, _MM_SHUFFLE(3, 1, 3, 1));
115 r1
= _mm_add_ps(r1
, _mm_mul_ps(s
, coeffs
));
117 s
= _mm_shuffle_ps(s2
, s3
, _MM_SHUFFLE(2, 0, 2, 0));
118 r2
= _mm_add_ps(r2
, _mm_mul_ps(s
, coeffs
));
120 s
= _mm_shuffle_ps(s2
, s3
, _MM_SHUFFLE(3, 1, 3, 1));
121 r3
= _mm_add_ps(r3
, _mm_mul_ps(s
, coeffs
));
125 _MM_TRANSPOSE4_PS(r0
, r1
, r2
, r3
);
126 return _mm_add_ps(_mm_add_ps(r0
, r1
), _mm_add_ps(r2
, r3
));
129 if(const std::size_t todo
{dst
.size()&3})
131 auto out
= dst
.last(todo
);
132 std::generate(out
.begin(), out
.end(), [&in
,this]
134 __m128 r4
{_mm_setzero_ps()};
135 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
137 const __m128 coeffs
{_mm_load_ps(&mCoeffs
[j
])};
138 const __m128 s
{_mm_setr_ps(in
[j
*2], in
[j
*2 + 2], in
[j
*2 + 4], in
[j
*2 + 6])};
139 r4
= _mm_add_ps(r4
, _mm_mul_ps(s
, coeffs
));
142 r4
= _mm_add_ps(r4
, _mm_shuffle_ps(r4
, r4
, _MM_SHUFFLE(0, 1, 2, 3)));
143 r4
= _mm_add_ps(r4
, _mm_movehl_ps(r4
, r4
));
144 return _mm_cvtss_f32(r4
);
148 #elif defined(HAVE_NEON)
150 if(const std::size_t todo
{dst
.size()>>2})
152 auto out
= al::span
{reinterpret_cast<float32x4_t
*>(dst
.data()), todo
};
153 std::generate(out
.begin(), out
.end(), [&in
,this]
155 float32x4_t r0
{vdupq_n_f32(0.0f
)};
156 float32x4_t r1
{vdupq_n_f32(0.0f
)};
157 float32x4_t r2
{vdupq_n_f32(0.0f
)};
158 float32x4_t r3
{vdupq_n_f32(0.0f
)};
159 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
161 const float32x4_t coeffs
{vld1q_f32(&mCoeffs
[j
])};
162 const float32x4_t s0
{vld1q_f32(&in
[j
*2])};
163 const float32x4_t s1
{vld1q_f32(&in
[j
*2 + 4])};
164 const float32x4_t s2
{vcombine_f32(vget_high_f32(s0
), vget_low_f32(s1
))};
165 const float32x4_t s3
{vcombine_f32(vget_high_f32(s1
), vld1_f32(&in
[j
*2 + 8]))};
166 const float32x4x2_t values0
{vuzpq_f32(s0
, s1
)};
167 const float32x4x2_t values1
{vuzpq_f32(s2
, s3
)};
169 r0
= vmlaq_f32(r0
, values0
.val
[0], coeffs
);
170 r1
= vmlaq_f32(r1
, values0
.val
[1], coeffs
);
171 r2
= vmlaq_f32(r2
, values1
.val
[0], coeffs
);
172 r3
= vmlaq_f32(r3
, values1
.val
[1], coeffs
);
176 vtranspose4(r0
, r1
, r2
, r3
);
177 return vaddq_f32(vaddq_f32(r0
, r1
), vaddq_f32(r2
, r3
));
180 if(const std::size_t todo
{dst
.size()&3})
182 auto out
= dst
.last(todo
);
183 std::generate(out
.begin(), out
.end(), [&in
,this]
185 float32x4_t r4
{vdupq_n_f32(0.0f
)};
186 for(std::size_t j
{0};j
< mCoeffs
.size();j
+=4)
188 const float32x4_t coeffs
{vld1q_f32(&mCoeffs
[j
])};
189 const float32x4_t s
{load4(in
[j
*2], in
[j
*2 + 2], in
[j
*2 + 4], in
[j
*2 + 6])};
190 r4
= vmlaq_f32(r4
, s
, coeffs
);
193 r4
= vaddq_f32(r4
, vrev64q_f32(r4
));
194 return vget_lane_f32(vadd_f32(vget_low_f32(r4
), vget_high_f32(r4
)), 0);
200 std::generate(dst
.begin(), dst
.end(), [&in
,this]
203 for(std::size_t j
{0};j
< mCoeffs
.size();++j
)
204 ret
+= in
[j
*2] * mCoeffs
[j
];
211 #endif /* PHASE_SHIFTER_H */