common/pffft.cpp

   1 //$ nobt
   2
   3 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   4  * Copyright (c) 2023  Christopher Robinson
   5  *
   6  * Based on original fortran 77 code from FFTPACKv4 from NETLIB
   7  * (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
   8  * of NCAR, in 1985.
   9  *
  10  * As confirmed by the NCAR fftpack software curators, the following
  11  * FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
  12  * released under the same terms.
  13  *
  14  * FFTPACK license:
  15  *
  16  * http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
  17  *
  18  * Copyright (c) 2004 the University Corporation for Atmospheric
  19  * Research ("UCAR"). All rights reserved. Developed by NCAR's
  20  * Computational and Information Systems Laboratory, UCAR,
  21  * www.cisl.ucar.edu.
  22  *
  23  * Redistribution and use of the Software in source and binary forms,
  24  * with or without modification, is permitted provided that the
  25  * following conditions are met:
  26  *
  27  * - Neither the names of NCAR's Computational and Information Systems
  28  * Laboratory, the University Corporation for Atmospheric Research,
  29  * nor the names of its sponsors or contributors may be used to
  30  * endorse or promote products derived from this Software without
  31  * specific prior written permission.
  32  *
  33  * - Redistributions of source code must retain the above copyright
  34  * notices, this list of conditions, and the disclaimer below.
  35  *
  36  * - Redistributions in binary form must reproduce the above copyright
  37  * notice, this list of conditions, and the disclaimer below in the
  38  * documentation and/or other materials provided with the
  39  * distribution.
  40  *
  41  * THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  42  * EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
  43  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  44  * NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
  45  * HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
  46  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  47  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  48  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
  49  * SOFTWARE.
  50  *
  51  *
  52  * PFFFT : a Pretty Fast FFT.
  53  *
  54  * This file is largerly based on the original FFTPACK implementation, modified
  55  * in order to take advantage of SIMD instructions of modern CPUs.
  56  */
  57
  58 #include "pffft.h"
  59
  60 #include <algorithm>
  61 #include <array>
  62 #include <cassert>
  63 #include <cmath>
  64 #include <cstdint>
  65 #include <cstdio>
  66 #include <cstring>
  67 #include <new>
  68 #include <utility>
  69 #include <vector>
  70
  71 #include "albit.h"
  72 #include "almalloc.h"
  73 #include "alnumbers.h"
  74 #include "alnumeric.h"
  75 #include "alspan.h"
  76 #include "opthelpers.h"
  77
  78
  79 using uint = unsigned int;
  80
  81 namespace {
  82
  83 #if defined(__GNUC__) || defined(_MSC_VER)
  84 #define RESTRICT __restrict
  85 #else
  86 #define RESTRICT
  87 #endif
  88
  89 /* Vector support macros: the rest of the code is independent of
  90  * SSE/Altivec/NEON -- adding support for other platforms with 4-element
  91  * vectors should be limited to these macros
  92  */
  93
  94 /* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
  95 //#define PFFFT_SIMD_DISABLE
  96
  97 #ifndef PFFFT_SIMD_DISABLE
  98 /*
  99  * Altivec support macros
 100  */
 101 #if defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)
 102 #include <altivec.h>
 103 using v4sf = vector float;
 104 constexpr uint SimdSize{4};
 105 force_inline v4sf vzero() noexcept { return (vector float)vec_splat_u8(0); }
 106 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vec_madd(a, b, vzero()); }
 107 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vec_add(a, b); }
 108 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vec_madd(a, b, c); }
 109 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vec_sub(a, b); }
 110 force_inline v4sf ld_ps1(float a) noexcept { return vec_splats(a); }
 111
 112 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
 113 {
 114     /* There a more efficient way to do this? */
 115     alignas(16) std::array<float,4> vals{{a, b, c, d}};
 116     return vec_ld(0, vals.data());
 117 }
 118 force_inline v4sf vinsert0(v4sf v, float a) noexcept { return vec_insert(a, v, 0); }
 119 force_inline float vextract0(v4sf v) noexcept { return vec_extract(v, 0); }
 120
 121 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 122 {
 123     out1 = vec_mergeh(in1, in2);
 124     out2 = vec_mergel(in1, in2);
 125 }
 126 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 127 {
 128     out1 = vec_perm(in1, in2, (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27});
 129     out2 = vec_perm(in1, in2, (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31});
 130     out1 = tmp;
 131 }
 132
 133 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
 134 {
 135     v4sf y0{vec_mergeh(x0, x2)};
 136     v4sf y1{vec_mergel(x0, x2)};
 137     v4sf y2{vec_mergeh(x1, x3)};
 138     v4sf y3{vec_mergel(x1, x3)};
 139     x0 = vec_mergeh(y0, y2);
 140     x1 = vec_mergel(y0, y2);
 141     x2 = vec_mergeh(y1, y3);
 142     x3 = vec_mergel(y1, y3);
 143 }
 144
 145 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
 146 { return vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}); }
 147
 148 /*
 149  * SSE1 support macros
 150  */
 151 #elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || \
 152     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
 153
 154 #include <xmmintrin.h>
 155 using v4sf = __m128;
 156 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/
 157  * finalize functions anyway so you will have to work if you want to enable AVX
 158  * with its 256-bit vectors.
 159  */
 160 constexpr uint SimdSize{4};
 161 force_inline v4sf vzero() noexcept { return _mm_setzero_ps(); }
 162 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return _mm_mul_ps(a, b); }
 163 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return _mm_add_ps(a, b); }
 164 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return _mm_add_ps(_mm_mul_ps(a,b), c); }
 165 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return _mm_sub_ps(a, b); }
 166 force_inline v4sf ld_ps1(float a) noexcept { return _mm_set1_ps(a); }
 167
 168 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
 169 { return _mm_setr_ps(a, b, c, d); }
 170 force_inline v4sf vinsert0(const v4sf v, const float a) noexcept
 171 { return _mm_move_ss(v, _mm_set_ss(a)); }
 172 force_inline float vextract0(v4sf v) noexcept
 173 { return _mm_cvtss_f32(v); }
 174
 175 force_inline void interleave2(const v4sf in1, const v4sf in2, v4sf &out1, v4sf &out2) noexcept
 176 {
 177     out1 = _mm_unpacklo_ps(in1, in2);
 178     out2 = _mm_unpackhi_ps(in1, in2);
 179 }
 180 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 181 {
 182     out1 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0));
 183     out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
 184 }
 185
 186 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
 187 { _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }
 188
 189 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
 190 { return _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)); }
 191
 192 /*
 193  * ARM NEON support macros
 194  */
 195 #elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) || defined(_M_ARM64)
 196
 197 #include <arm_neon.h>
 198 using v4sf = float32x4_t;
 199 constexpr uint SimdSize{4};
 200 force_inline v4sf vzero() noexcept { return vdupq_n_f32(0.0f); }
 201 force_inline v4sf vmul(v4sf a, v4sf b) noexcept { return vmulq_f32(a, b); }
 202 force_inline v4sf vadd(v4sf a, v4sf b) noexcept { return vaddq_f32(a, b); }
 203 force_inline v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return vmlaq_f32(c, a, b); }
 204 force_inline v4sf vsub(v4sf a, v4sf b) noexcept { return vsubq_f32(a, b); }
 205 force_inline v4sf ld_ps1(float a) noexcept { return vdupq_n_f32(a); }
 206
 207 force_inline v4sf vset4(float a, float b, float c, float d) noexcept
 208 {
 209     float32x4_t ret{vmovq_n_f32(a)};
 210     ret = vsetq_lane_f32(b, ret, 1);
 211     ret = vsetq_lane_f32(c, ret, 2);
 212     ret = vsetq_lane_f32(d, ret, 3);
 213     return ret;
 214 }
 215 force_inline v4sf vinsert0(v4sf v, float a) noexcept
 216 { return vsetq_lane_f32(a, v, 0); }
 217 force_inline float vextract0(v4sf v) noexcept
 218 { return vgetq_lane_f32(v, 0); }
 219
 220 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 221 {
 222     float32x4x2_t tmp{vzipq_f32(in1, in2)};
 223     out1 = tmp.val[0];
 224     out2 = tmp.val[1];
 225 }
 226 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 227 {
 228     float32x4x2_t tmp{vuzpq_f32(in1, in2)};
 229     out1 = tmp.val[0];
 230     out2 = tmp.val[1];
 231 }
 232
 233 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
 234 {
 235     /* marginally faster version:
 236      * asm("vtrn.32 %q0, %q1;\n"
 237      *     "vtrn.32 %q2, %q3\n
 238      *     "vswp %f0, %e2\n
 239      *     "vswp %f1, %e3"
 240      *     : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
 241      */
 242     float32x4x2_t t0_{vzipq_f32(x0, x2)};
 243     float32x4x2_t t1_{vzipq_f32(x1, x3)};
 244     float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
 245     float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
 246     x0 = u0_.val[0];
 247     x1 = u0_.val[1];
 248     x2 = u1_.val[0];
 249     x3 = u1_.val[1];
 250 }
 251
 252 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
 253 { return vcombine_f32(vget_low_f32(b), vget_high_f32(a)); }
 254
 255 /*
 256  * Generic GCC vector macros
 257  */
 258 #elif defined(__GNUC__)
 259
 260 using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
 261 constexpr uint SimdSize{4};
 262 force_inline constexpr v4sf vzero() noexcept { return v4sf{0.0f, 0.0f, 0.0f, 0.0f}; }
 263 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
 264 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
 265 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
 266 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
 267 force_inline constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }
 268
 269 force_inline constexpr v4sf vset4(float a, float b, float c, float d) noexcept
 270 { return v4sf{a, b, c, d}; }
 271 force_inline constexpr v4sf vinsert0(v4sf v, float a) noexcept
 272 { return v4sf{a, v[1], v[2], v[3]}; }
 273 force_inline float vextract0(v4sf v) noexcept
 274 { return v[0]; }
 275
 276 force_inline v4sf unpacklo(v4sf a, v4sf b) noexcept
 277 { return v4sf{a[0], b[0], a[1], b[1]}; }
 278 force_inline v4sf unpackhi(v4sf a, v4sf b) noexcept
 279 { return v4sf{a[2], b[2], a[3], b[3]}; }
 280
 281 force_inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 282 {
 283     out1 = unpacklo(in1, in2);
 284     out2 = unpackhi(in1, in2);
 285 }
 286 force_inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
 287 {
 288     out1 = v4sf{in1[0], in1[2], in2[0], in2[2]};
 289     out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
 290 }
 291
 292 force_inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
 293 {
 294     v4sf tmp0{unpacklo(x0, x1)};
 295     v4sf tmp2{unpacklo(x2, x3)};
 296     v4sf tmp1{unpackhi(x0, x1)};
 297     v4sf tmp3{unpackhi(x2, x3)};
 298     x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]};
 299     x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]};
 300     x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
 301     x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
 302 }
 303
 304 force_inline v4sf vswaphl(v4sf a, v4sf b) noexcept
 305 { return v4sf{b[0], b[1], a[2], a[3]}; }
 306
 307 #else
 308
 309 #warning "building with simd disabled !\n";
 310 #define PFFFT_SIMD_DISABLE // fallback to scalar code
 311 #endif
 312
 313 #endif /* PFFFT_SIMD_DISABLE */
 314
 315 // fallback mode for situations where SIMD is not available, use scalar mode instead
 316 #ifdef PFFFT_SIMD_DISABLE
 317 using v4sf = float;
 318 constexpr uint SimdSize{1};
 319 force_inline constexpr v4sf vmul(v4sf a, v4sf b) noexcept { return a * b; }
 320 force_inline constexpr v4sf vadd(v4sf a, v4sf b) noexcept { return a + b; }
 321 force_inline constexpr v4sf vmadd(v4sf a, v4sf b, v4sf c) noexcept { return a*b + c; }
 322 force_inline constexpr v4sf vsub(v4sf a, v4sf b) noexcept { return a - b; }
 323 force_inline constexpr v4sf ld_ps1(float a) noexcept { return a; }
 324
 325 #else
 326
 327 [[maybe_unused]] inline
 328 auto valigned(const float *ptr) noexcept -> bool
 329 {
 330     static constexpr uintptr_t alignmask{SimdSize*sizeof(float) - 1};
 331     return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
 332 }
 333 #endif
 334
 335 // shortcuts for complex multiplications
 336 force_inline void vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
 337 {
 338     v4sf tmp{vmul(ar, bi)};
 339     ar = vsub(vmul(ar, br), vmul(ai, bi));
 340     ai = vmadd(ai, br, tmp);
 341 }
 342 force_inline void vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
 343 {
 344     v4sf tmp{vmul(ar, bi)};
 345     ar = vmadd(ai, bi, vmul(ar, br));
 346     ai = vsub(vmul(ai, br), tmp);
 347 }
 348
 349 #if !defined(PFFFT_SIMD_DISABLE)
 350
 351 inline void assertv4(const al::span<const float,4> v_f [[maybe_unused]],
 352     const float f0 [[maybe_unused]], const float f1 [[maybe_unused]],
 353     const float f2 [[maybe_unused]], const float f3 [[maybe_unused]])
 354 { assert(v_f[0] == f0 && v_f[1] == f1 && v_f[2] == f2 && v_f[3] == f3); }
 355
 356 template<typename T, T ...N>
 357 constexpr auto make_float_array(std::integer_sequence<T,N...>)
 358 { return std::array{static_cast<float>(N)...}; }
 359
 360 /* detect bugs with the vector support macros */
 361 [[maybe_unused]] void validate_pffft_simd()
 362 {
 363     using float4 = std::array<float,4>;
 364     static constexpr auto f = make_float_array(std::make_index_sequence<16>{});
 365
 366     v4sf a0_v{vset4(f[ 0], f[ 1], f[ 2], f[ 3])};
 367     v4sf a1_v{vset4(f[ 4], f[ 5], f[ 6], f[ 7])};
 368     v4sf a2_v{vset4(f[ 8], f[ 9], f[10], f[11])};
 369     v4sf a3_v{vset4(f[12], f[13], f[14], f[15])};
 370     v4sf u_v{};
 371
 372     auto t_v = vzero();
 373     auto t_f = al::bit_cast<float4>(t_v);
 374     printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
 375     assertv4(t_f, 0, 0, 0, 0);
 376
 377     t_v = vadd(a1_v, a2_v);
 378     t_f = al::bit_cast<float4>(t_v);
 379     printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
 380     assertv4(t_f, 12, 14, 16, 18);
 381
 382     t_v = vmul(a1_v, a2_v);
 383     t_f = al::bit_cast<float4>(t_v);
 384     printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
 385     assertv4(t_f, 32, 45, 60, 77);
 386
 387     t_v = vmadd(a1_v, a2_v, a0_v);
 388     t_f = al::bit_cast<float4>(t_v);
 389     printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
 390     assertv4(t_f, 32, 46, 62, 80);
 391
 392     interleave2(a1_v, a2_v, t_v, u_v);
 393     t_f = al::bit_cast<float4>(t_v);
 394     auto u_f = al::bit_cast<float4>(u_v);
 395     printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
 396         t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
 397     assertv4(t_f, 4, 8, 5, 9);
 398     assertv4(u_f, 6, 10, 7, 11);
 399
 400     uninterleave2(a1_v, a2_v, t_v, u_v);
 401     t_f = al::bit_cast<float4>(t_v);
 402     u_f = al::bit_cast<float4>(u_v);
 403     printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
 404         t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
 405     assertv4(t_f, 4, 6, 8, 10);
 406     assertv4(u_f, 5, 7, 9, 11);
 407
 408     t_v = ld_ps1(f[15]);
 409     t_f = al::bit_cast<float4>(t_v);
 410     printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
 411     assertv4(t_f, 15, 15, 15, 15);
 412
 413     t_v = vswaphl(a1_v, a2_v);
 414     t_f = al::bit_cast<float4>(t_v);
 415     printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
 416     assertv4(t_f, 8, 9, 6, 7);
 417
 418     vtranspose4(a0_v, a1_v, a2_v, a3_v);
 419     auto a0_f = al::bit_cast<float4>(a0_v);
 420     auto a1_f = al::bit_cast<float4>(a1_v);
 421     auto a2_f = al::bit_cast<float4>(a2_v);
 422     auto a3_f = al::bit_cast<float4>(a3_v);
 423     printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
 424           a0_f[0], a0_f[1], a0_f[2], a0_f[3], a1_f[0], a1_f[1], a1_f[2], a1_f[3],
 425           a2_f[0], a2_f[1], a2_f[2], a2_f[3], a3_f[0], a3_f[1], a3_f[2], a3_f[3]);
 426     assertv4(a0_f, 0, 4, 8, 12);
 427     assertv4(a1_f, 1, 5, 9, 13);
 428     assertv4(a2_f, 2, 6, 10, 14);
 429     assertv4(a3_f, 3, 7, 11, 15);
 430 }
 431 #endif //!PFFFT_SIMD_DISABLE
 432
 433 /* SSE and co like 16-bytes aligned pointers */
 434 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
 435 constexpr auto V4sfAlignment = size_t(64);
 436 constexpr auto V4sfAlignVal = std::align_val_t(V4sfAlignment);
 437
 438 /* NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic)
 439  * FIXME: Converting this from raw pointers to spans or something will probably
 440  * need significant work to maintain performance, given non-sequential range-
 441  * checked accesses and lack of 'restrict' to indicate non-aliased memory. At
 442  * least, some tests should be done to check the impact of using range-checked
 443  * spans here before blindly switching.
 444  */
 445 /*
 446   passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
 447 */
 448 NOINLINE void passf2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
 449     const float *const wa1, const float fsign)
 450 {
 451     const size_t l1ido{l1*ido};
 452     if(ido <= 2)
 453     {
 454         for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
 455         {
 456             ch[0]         = vadd(cc[0], cc[ido+0]);
 457             ch[l1ido]     = vsub(cc[0], cc[ido+0]);
 458             ch[1]         = vadd(cc[1], cc[ido+1]);
 459             ch[l1ido + 1] = vsub(cc[1], cc[ido+1]);
 460         }
 461     }
 462     else
 463     {
 464         for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
 465         {
 466             for(size_t i{0};i < ido-1;i += 2)
 467             {
 468                 v4sf tr2{vsub(cc[i+0], cc[i+ido+0])};
 469                 v4sf ti2{vsub(cc[i+1], cc[i+ido+1])};
 470                 v4sf wr{ld_ps1(wa1[i])}, wi{ld_ps1(wa1[i+1]*fsign)};
 471                 ch[i]   = vadd(cc[i+0], cc[i+ido+0]);
 472                 ch[i+1] = vadd(cc[i+1], cc[i+ido+1]);
 473                 vcplxmul(tr2, ti2, wr, wi);
 474                 ch[i+l1ido]   = tr2;
 475                 ch[i+l1ido+1] = ti2;
 476             }
 477         }
 478     }
 479 }
 480
 481 /*
 482   passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
 483 */
 484 NOINLINE void passf3_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
 485     const float *const wa1, const float fsign)
 486 {
 487     assert(ido > 2);
 488
 489     const v4sf taur{ld_ps1(-0.5f)};
 490     const v4sf taui{ld_ps1(0.866025403784439f*fsign)};
 491     const size_t l1ido{l1*ido};
 492     const auto wa2 = wa1 + ido;
 493     for(size_t k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido)
 494     {
 495         for(size_t i{0};i < ido-1;i += 2)
 496         {
 497             v4sf tr2{vadd(cc[i+ido], cc[i+2*ido])};
 498             v4sf cr2{vmadd(taur, tr2, cc[i])};
 499             ch[i]  = vadd(tr2, cc[i]);
 500             v4sf ti2{vadd(cc[i+ido+1], cc[i+2*ido+1])};
 501             v4sf ci2{vmadd(taur, ti2, cc[i+1])};
 502             ch[i+1] = vadd(cc[i+1], ti2);
 503             v4sf cr3{vmul(taui, vsub(cc[i+ido], cc[i+2*ido]))};
 504             v4sf ci3{vmul(taui, vsub(cc[i+ido+1], cc[i+2*ido+1]))};
 505             v4sf dr2{vsub(cr2, ci3)};
 506             v4sf dr3{vadd(cr2, ci3)};
 507             v4sf di2{vadd(ci2, cr3)};
 508             v4sf di3{vsub(ci2, cr3)};
 509             float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
 510             vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
 511             ch[i+l1ido] = dr2;
 512             ch[i+l1ido + 1] = di2;
 513             vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
 514             ch[i+2*l1ido] = dr3;
 515             ch[i+2*l1ido+1] = di3;
 516         }
 517     }
 518 } /* passf3 */
 519
 520 NOINLINE void passf4_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
 521     const float *const wa1, const float fsign)
 522 {
 523     /* fsign == -1 for forward transform and +1 for backward transform */
 524     const v4sf vsign{ld_ps1(fsign)};
 525     const size_t l1ido{l1*ido};
 526     if(ido == 2)
 527     {
 528         for(size_t k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
 529         {
 530             v4sf tr1{vsub(cc[0], cc[2*ido + 0])};
 531             v4sf tr2{vadd(cc[0], cc[2*ido + 0])};
 532             v4sf ti1{vsub(cc[1], cc[2*ido + 1])};
 533             v4sf ti2{vadd(cc[1], cc[2*ido + 1])};
 534             v4sf ti4{vmul(vsub(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
 535             v4sf tr4{vmul(vsub(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
 536             v4sf tr3{vadd(cc[ido + 0], cc[3*ido + 0])};
 537             v4sf ti3{vadd(cc[ido + 1], cc[3*ido + 1])};
 538
 539             ch[0*l1ido + 0] = vadd(tr2, tr3);
 540             ch[0*l1ido + 1] = vadd(ti2, ti3);
 541             ch[1*l1ido + 0] = vadd(tr1, tr4);
 542             ch[1*l1ido + 1] = vadd(ti1, ti4);
 543             ch[2*l1ido + 0] = vsub(tr2, tr3);
 544             ch[2*l1ido + 1] = vsub(ti2, ti3);
 545             ch[3*l1ido + 0] = vsub(tr1, tr4);
 546             ch[3*l1ido + 1] = vsub(ti1, ti4);
 547         }
 548     }
 549     else
 550     {
 551         const auto wa2 = wa1 + ido;
 552         const auto wa3 = wa2 + ido;
 553         for(size_t k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido)
 554         {
 555             for(size_t i{0};i < ido-1;i+=2)
 556             {
 557                 v4sf tr1{vsub(cc[i + 0], cc[i + 2*ido + 0])};
 558                 v4sf tr2{vadd(cc[i + 0], cc[i + 2*ido + 0])};
 559                 v4sf ti1{vsub(cc[i + 1], cc[i + 2*ido + 1])};
 560                 v4sf ti2{vadd(cc[i + 1], cc[i + 2*ido + 1])};
 561                 v4sf tr4{vmul(vsub(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
 562                 v4sf ti4{vmul(vsub(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
 563                 v4sf tr3{vadd(cc[i + ido + 0], cc[i + 3*ido + 0])};
 564                 v4sf ti3{vadd(cc[i + ido + 1], cc[i + 3*ido + 1])};
 565
 566                 ch[i] = vadd(tr2, tr3);
 567                 v4sf cr3{vsub(tr2, tr3)};
 568                 ch[i + 1] = vadd(ti2, ti3);
 569                 v4sf ci3{vsub(ti2, ti3)};
 570
 571                 v4sf cr2{vadd(tr1, tr4)};
 572                 v4sf cr4{vsub(tr1, tr4)};
 573                 v4sf ci2{vadd(ti1, ti4)};
 574                 v4sf ci4{vsub(ti1, ti4)};
 575                 float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
 576                 vcplxmul(cr2, ci2, ld_ps1(wr1), ld_ps1(wi1));
 577                 float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
 578                 ch[i + l1ido] = cr2;
 579                 ch[i + l1ido + 1] = ci2;
 580
 581                 vcplxmul(cr3, ci3, ld_ps1(wr2), ld_ps1(wi2));
 582                 float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
 583                 ch[i + 2*l1ido] = cr3;
 584                 ch[i + 2*l1ido + 1] = ci3;
 585
 586                 vcplxmul(cr4, ci4, ld_ps1(wr3), ld_ps1(wi3));
 587                 ch[i + 3*l1ido] = cr4;
 588                 ch[i + 3*l1ido + 1] = ci4;
 589             }
 590         }
 591     }
 592 } /* passf4 */
 593
 594 /*
 595  * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5
 596  */
 597 NOINLINE void passf5_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
 598     const float *const wa1, const float fsign)
 599 {
 600     const v4sf tr11{ld_ps1(0.309016994374947f)};
 601     const v4sf tr12{ld_ps1(-0.809016994374947f)};
 602     const v4sf ti11{ld_ps1(0.951056516295154f*fsign)};
 603     const v4sf ti12{ld_ps1(0.587785252292473f*fsign)};
 604
 605     auto cc_ref = [&cc,ido](size_t a_1, size_t a_2) noexcept -> auto&
 606     { return cc[(a_2-1)*ido + a_1 + 1]; };
 607     auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_3) noexcept -> auto&
 608     { return ch[(a_3-1)*l1*ido + a_1 + 1]; };
 609
 610     assert(ido > 2);
 611
 612     const auto wa2 = wa1 + ido;
 613     const auto wa3 = wa2 + ido;
 614     const auto wa4 = wa3 + ido;
 615     for(size_t k{0};k < l1;++k, cc += 5*ido, ch += ido)
 616     {
 617         for(size_t i{0};i < ido-1;i += 2)
 618         {
 619             v4sf ti5{vsub(cc_ref(i  , 2), cc_ref(i  , 5))};
 620             v4sf ti2{vadd(cc_ref(i  , 2), cc_ref(i  , 5))};
 621             v4sf ti4{vsub(cc_ref(i  , 3), cc_ref(i  , 4))};
 622             v4sf ti3{vadd(cc_ref(i  , 3), cc_ref(i  , 4))};
 623             v4sf tr5{vsub(cc_ref(i-1, 2), cc_ref(i-1, 5))};
 624             v4sf tr2{vadd(cc_ref(i-1, 2), cc_ref(i-1, 5))};
 625             v4sf tr4{vsub(cc_ref(i-1, 3), cc_ref(i-1, 4))};
 626             v4sf tr3{vadd(cc_ref(i-1, 3), cc_ref(i-1, 4))};
 627             ch_ref(i-1, 1) = vadd(cc_ref(i-1, 1), vadd(tr2, tr3));
 628             ch_ref(i  , 1) = vadd(cc_ref(i  , 1), vadd(ti2, ti3));
 629             v4sf cr2{vadd(cc_ref(i-1, 1), vmadd(tr11, tr2, vmul(tr12, tr3)))};
 630             v4sf ci2{vadd(cc_ref(i  , 1), vmadd(tr11, ti2, vmul(tr12, ti3)))};
 631             v4sf cr3{vadd(cc_ref(i-1, 1), vmadd(tr12, tr2, vmul(tr11, tr3)))};
 632             v4sf ci3{vadd(cc_ref(i  , 1), vmadd(tr12, ti2, vmul(tr11, ti3)))};
 633             v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
 634             v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
 635             v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
 636             v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
 637             v4sf dr3{vsub(cr3, ci4)};
 638             v4sf dr4{vadd(cr3, ci4)};
 639             v4sf di3{vadd(ci3, cr4)};
 640             v4sf di4{vsub(ci3, cr4)};
 641             v4sf dr5{vadd(cr2, ci5)};
 642             v4sf dr2{vsub(cr2, ci5)};
 643             v4sf di5{vsub(ci2, cr5)};
 644             v4sf di2{vadd(ci2, cr5)};
 645             float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
 646             float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
 647             vcplxmul(dr2, di2, ld_ps1(wr1), ld_ps1(wi1));
 648             ch_ref(i - 1, 2) = dr2;
 649             ch_ref(i, 2)     = di2;
 650             vcplxmul(dr3, di3, ld_ps1(wr2), ld_ps1(wi2));
 651             ch_ref(i - 1, 3) = dr3;
 652             ch_ref(i, 3)     = di3;
 653             vcplxmul(dr4, di4, ld_ps1(wr3), ld_ps1(wi3));
 654             ch_ref(i - 1, 4) = dr4;
 655             ch_ref(i, 4)     = di4;
 656             vcplxmul(dr5, di5, ld_ps1(wr4), ld_ps1(wi4));
 657             ch_ref(i - 1, 5) = dr5;
 658             ch_ref(i, 5)     = di5;
 659         }
 660     }
 661 }
 662
 663 NOINLINE void radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
 664     v4sf *RESTRICT ch, const float *const wa1)
 665 {
 666     const size_t l1ido{l1*ido};
 667     for(size_t k{0};k < l1ido;k += ido)
 668     {
 669         v4sf a{cc[k]}, b{cc[k + l1ido]};
 670         ch[2*k] = vadd(a, b);
 671         ch[2*(k+ido)-1] = vsub(a, b);
 672     }
 673     if(ido < 2)
 674         return;
 675     if(ido != 2)
 676     {
 677         for(size_t k{0};k < l1ido;k += ido)
 678         {
 679             for(size_t i{2};i < ido;i += 2)
 680             {
 681                 v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
 682                 v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
 683                 vcplxmulconj(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
 684                 ch[i + 2*k] = vadd(bi, ti2);
 685                 ch[2*(k+ido) - i] = vsub(ti2, bi);
 686                 ch[i - 1 + 2*k] = vadd(br, tr2);
 687                 ch[2*(k+ido) - i -1] = vsub(br, tr2);
 688             }
 689         }
 690         if((ido&1) == 1)
 691             return;
 692     }
 693     const v4sf minus_one{ld_ps1(-1.0f)};
 694     for(size_t k{0};k < l1ido;k += ido)
 695     {
 696         ch[2*k + ido] = vmul(minus_one, cc[ido-1 + k + l1ido]);
 697         ch[2*k + ido-1] = cc[k + ido-1];
 698     }
 699 } /* radf2 */
 700
 701
 702 NOINLINE void radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v4sf *RESTRICT ch,
 703     const float *const wa1)
 704 {
 705     const size_t l1ido{l1*ido};
 706     for(size_t k{0};k < l1ido;k += ido)
 707     {
 708         v4sf a{cc[2*k]};
 709         v4sf b{cc[2*(k+ido) - 1]};
 710         ch[k] = vadd(a, b);
 711         ch[k + l1ido] = vsub(a, b);
 712     }
 713     if(ido < 2)
 714         return;
 715     if(ido != 2)
 716     {
 717         for(size_t k{0};k < l1ido;k += ido)
 718         {
 719             for(size_t i{2};i < ido;i += 2)
 720             {
 721                 v4sf a{cc[i-1 + 2*k]};
 722                 v4sf b{cc[2*(k + ido) - i - 1]};
 723                 v4sf c{cc[i+0 + 2*k]};
 724                 v4sf d{cc[2*(k + ido) - i + 0]};
 725                 ch[i-1 + k] = vadd(a, b);
 726                 v4sf tr2{vsub(a, b)};
 727                 ch[i+0 + k] = vsub(c, d);
 728                 v4sf ti2{vadd(c, d)};
 729                 vcplxmul(tr2, ti2, ld_ps1(wa1[i - 2]), ld_ps1(wa1[i - 1]));
 730                 ch[i-1 + k + l1ido] = tr2;
 731                 ch[i+0 + k + l1ido] = ti2;
 732             }
 733         }
 734         if((ido&1) == 1)
 735             return;
 736     }
 737     const v4sf minus_two{ld_ps1(-2.0f)};
 738     for(size_t k{0};k < l1ido;k += ido)
 739     {
 740         v4sf a{cc[2*k + ido-1]};
 741         v4sf b{cc[2*k + ido]};
 742         ch[k + ido-1] = vadd(a,a);
 743         ch[k + ido-1 + l1ido] = vmul(minus_two, b);
 744     }
 745 } /* radb2 */
 746
 747 void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
 748     const float *const wa1)
 749 {
 750     const v4sf taur{ld_ps1(-0.5f)};
 751     const v4sf taui{ld_ps1(0.866025403784439f)};
 752     for(size_t k{0};k < l1;++k)
 753     {
 754         v4sf cr2{vadd(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
 755         ch[        (3*k    )*ido] = vadd(cc[k*ido], cr2);
 756         ch[        (3*k + 2)*ido] = vmul(taui, vsub(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
 757         ch[ido-1 + (3*k + 1)*ido] = vmadd(taur, cr2, cc[k*ido]);
 758     }
 759     if(ido == 1)
 760         return;
 761
 762     const auto wa2 = wa1 + ido;
 763     for(size_t k{0};k < l1;++k)
 764     {
 765         for(size_t i{2};i < ido;i += 2)
 766         {
 767             const size_t ic{ido - i};
 768             v4sf wr1{ld_ps1(wa1[i - 2])};
 769             v4sf wi1{ld_ps1(wa1[i - 1])};
 770             v4sf dr2{cc[i - 1 + (k + l1)*ido]};
 771             v4sf di2{cc[i     + (k + l1)*ido]};
 772             vcplxmulconj(dr2, di2, wr1, wi1);
 773
 774             v4sf wr2{ld_ps1(wa2[i - 2])};
 775             v4sf wi2{ld_ps1(wa2[i - 1])};
 776             v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
 777             v4sf di3{cc[i     + (k + l1*2)*ido]};
 778             vcplxmulconj(dr3, di3, wr2, wi2);
 779
 780             v4sf cr2{vadd(dr2, dr3)};
 781             v4sf ci2{vadd(di2, di3)};
 782             ch[i - 1 + 3*k*ido] = vadd(cc[i - 1 + k*ido], cr2);
 783             ch[i     + 3*k*ido] = vadd(cc[i     + k*ido], ci2);
 784             v4sf tr2{vmadd(taur, cr2, cc[i - 1 + k*ido])};
 785             v4sf ti2{vmadd(taur, ci2, cc[i     + k*ido])};
 786             v4sf tr3{vmul(taui, vsub(di2, di3))};
 787             v4sf ti3{vmul(taui, vsub(dr3, dr2))};
 788             ch[i  - 1 + (3*k + 2)*ido] = vadd(tr2, tr3);
 789             ch[ic - 1 + (3*k + 1)*ido] = vsub(tr2, tr3);
 790             ch[i      + (3*k + 2)*ido] = vadd(ti2, ti3);
 791             ch[ic     + (3*k + 1)*ido] = vsub(ti3, ti2);
 792         }
 793     }
 794 } /* radf3 */
 795
 796
 797 void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
 798     const float *const wa1)
 799 {
 800     static constexpr float taur{-0.5f};
 801     static constexpr float taui{0.866025403784439f};
 802     static constexpr float taui_2{taui*2.0f};
 803
 804     const v4sf vtaur{ld_ps1(taur)};
 805     const v4sf vtaui_2{ld_ps1(taui_2)};
 806     for(size_t k{0};k < l1;++k)
 807     {
 808         v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
 809         tr2 = vadd(tr2,tr2);
 810         v4sf cr2 = vmadd(vtaur, tr2, cc[3*k*ido]);
 811         ch[k*ido] = vadd(cc[3*k*ido], tr2);
 812         v4sf ci3 = vmul(vtaui_2, cc[(3*k + 2)*ido]);
 813         ch[(k + l1)*ido] = vsub(cr2, ci3);
 814         ch[(k + 2*l1)*ido] = vadd(cr2, ci3);
 815     }
 816     if(ido == 1)
 817         return;
 818
 819     const auto wa2 = wa1 + ido;
 820     const v4sf vtaui{ld_ps1(taui)};
 821     for(size_t k{0};k < l1;++k)
 822     {
 823         for(size_t i{2};i < ido;i += 2)
 824         {
 825             const size_t ic{ido - i};
 826             v4sf tr2{vadd(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
 827             v4sf cr2{vmadd(vtaur, tr2, cc[i - 1 + 3*k*ido])};
 828             ch[i - 1 + k*ido] = vadd(cc[i - 1 + 3*k*ido], tr2);
 829             v4sf ti2{vsub(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
 830             v4sf ci2{vmadd(vtaur, ti2, cc[i + 3*k*ido])};
 831             ch[i + k*ido] = vadd(cc[i + 3*k*ido], ti2);
 832             v4sf cr3{vmul(vtaui, vsub(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
 833             v4sf ci3{vmul(vtaui, vadd(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
 834             v4sf dr2{vsub(cr2, ci3)};
 835             v4sf dr3{vadd(cr2, ci3)};
 836             v4sf di2{vadd(ci2, cr3)};
 837             v4sf di3{vsub(ci2, cr3)};
 838             vcplxmul(dr2, di2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
 839             ch[i - 1 + (k + l1)*ido] = dr2;
 840             ch[i + (k + l1)*ido] = di2;
 841             vcplxmul(dr3, di3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
 842             ch[i - 1 + (k + 2*l1)*ido] = dr3;
 843             ch[i + (k + 2*l1)*ido] = di3;
 844         }
 845     }
 846 } /* radb3 */
 847
 848 NOINLINE void radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
 849     v4sf *RESTRICT ch, const float *const wa1)
 850 {
 851     const size_t l1ido{l1*ido};
 852     {
 853         const v4sf *RESTRICT cc_{cc}, *RESTRICT cc_end{cc + l1ido};
 854         v4sf *RESTRICT ch_{ch};
 855         while(cc != cc_end)
 856         {
 857             // this loop represents between 25% and 40% of total radf4_ps cost !
 858             v4sf a0{cc[0]}, a1{cc[l1ido]};
 859             v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
 860             v4sf tr1{vadd(a1, a3)};
 861             v4sf tr2{vadd(a0, a2)};
 862             ch[2*ido-1] = vsub(a0, a2);
 863             ch[2*ido  ] = vsub(a3, a1);
 864             ch[0      ] = vadd(tr1, tr2);
 865             ch[4*ido-1] = vsub(tr2, tr1);
 866             cc += ido; ch += 4*ido;
 867         }
 868         cc = cc_;
 869         ch = ch_;
 870     }
 871     if(ido < 2)
 872         return;
 873     if(ido != 2)
 874     {
 875         const auto wa2 = wa1 + ido;
 876         const auto wa3 = wa2 + ido;
 877
 878         for(size_t k{0};k < l1ido;k += ido)
 879         {
 880             const v4sf *RESTRICT pc{cc + 1 + k};
 881             for(size_t i{2};i < ido;i += 2, pc += 2)
 882             {
 883                 const size_t ic{ido - i};
 884
 885                 v4sf cr2{pc[1*l1ido+0]};
 886                 v4sf ci2{pc[1*l1ido+1]};
 887                 v4sf wr{ld_ps1(wa1[i - 2])};
 888                 v4sf wi{ld_ps1(wa1[i - 1])};
 889                 vcplxmulconj(cr2,ci2,wr,wi);
 890
 891                 v4sf cr3{pc[2*l1ido+0]};
 892                 v4sf ci3{pc[2*l1ido+1]};
 893                 wr = ld_ps1(wa2[i-2]);
 894                 wi = ld_ps1(wa2[i-1]);
 895                 vcplxmulconj(cr3, ci3, wr, wi);
 896
 897                 v4sf cr4{pc[3*l1ido]};
 898                 v4sf ci4{pc[3*l1ido+1]};
 899                 wr = ld_ps1(wa3[i-2]);
 900                 wi = ld_ps1(wa3[i-1]);
 901                 vcplxmulconj(cr4, ci4, wr, wi);
 902
 903                 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
 904
 905                 v4sf tr1{vadd(cr2,cr4)};
 906                 v4sf tr4{vsub(cr4,cr2)};
 907                 v4sf tr2{vadd(pc[0],cr3)};
 908                 v4sf tr3{vsub(pc[0],cr3)};
 909                 ch[i  - 1 + 4*k        ] = vadd(tr2,tr1);
 910                 ch[ic - 1 + 4*k + 3*ido] = vsub(tr2,tr1); // at this point tr1 and tr2 can be disposed
 911                 v4sf ti1{vadd(ci2,ci4)};
 912                 v4sf ti4{vsub(ci2,ci4)};
 913                 ch[i  - 1 + 4*k + 2*ido] = vadd(tr3,ti4);
 914                 ch[ic - 1 + 4*k + 1*ido] = vsub(tr3,ti4); // dispose tr3, ti4
 915                 v4sf ti2{vadd(pc[1],ci3)};
 916                 v4sf ti3{vsub(pc[1],ci3)};
 917                 ch[i  + 4*k        ] = vadd(ti1, ti2);
 918                 ch[ic + 4*k + 3*ido] = vsub(ti1, ti2);
 919                 ch[i  + 4*k + 2*ido] = vadd(tr4, ti3);
 920                 ch[ic + 4*k + 1*ido] = vsub(tr4, ti3);
 921             }
 922         }
 923         if((ido&1) == 1)
 924             return;
 925     }
 926     const v4sf minus_hsqt2{ld_ps1(al::numbers::sqrt2_v<float> * -0.5f)};
 927     for(size_t k{0};k < l1ido;k += ido)
 928     {
 929         v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
 930         v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
 931         v4sf ti1{vmul(minus_hsqt2, vadd(b, a))};
 932         v4sf tr1{vmul(minus_hsqt2, vsub(b, a))};
 933         ch[ido-1 + 4*k        ] = vadd(c, tr1);
 934         ch[ido-1 + 4*k + 2*ido] = vsub(c, tr1);
 935         ch[        4*k + 1*ido] = vsub(ti1, d);
 936         ch[        4*k + 3*ido] = vadd(ti1, d);
 937     }
 938 } /* radf4 */
 939
 940
 941 NOINLINE void radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc,
 942     v4sf *RESTRICT ch, const float *const wa1)
 943 {
 944     const v4sf two{ld_ps1(2.0f)};
 945     const size_t l1ido{l1*ido};
 946     {
 947         const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
 948         v4sf *ch_{ch};
 949         while(ch != ch_end)
 950         {
 951             v4sf a{cc[0]}, b{cc[4*ido-1]};
 952             v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
 953             v4sf tr3{vmul(two,d)};
 954             v4sf tr2{vadd(a,b)};
 955             v4sf tr1{vsub(a,b)};
 956             v4sf tr4{vmul(two,c)};
 957             ch[0*l1ido] = vadd(tr2, tr3);
 958             ch[2*l1ido] = vsub(tr2, tr3);
 959             ch[1*l1ido] = vsub(tr1, tr4);
 960             ch[3*l1ido] = vadd(tr1, tr4);
 961
 962             cc += 4*ido; ch += ido;
 963         }
 964         cc = cc_; ch = ch_;
 965     }
 966     if(ido < 2)
 967         return;
 968     if(ido != 2)
 969     {
 970         const auto wa2 = wa1 + ido;
 971         const auto wa3 = wa2 + ido;
 972
 973         for(size_t k{0};k < l1ido;k += ido)
 974         {
 975             const v4sf *RESTRICT pc{cc - 1 + 4*k};
 976             v4sf *RESTRICT ph{ch + k + 1};
 977             for(size_t i{2};i < ido;i += 2)
 978             {
 979                 v4sf tr1{vsub(pc[        i], pc[4*ido - i])};
 980                 v4sf tr2{vadd(pc[        i], pc[4*ido - i])};
 981                 v4sf ti4{vsub(pc[2*ido + i], pc[2*ido - i])};
 982                 v4sf tr3{vadd(pc[2*ido + i], pc[2*ido - i])};
 983                 ph[0] = vadd(tr2, tr3);
 984                 v4sf cr3{vsub(tr2, tr3)};
 985
 986                 v4sf ti3{vsub(pc[2*ido + i + 1], pc[2*ido - i + 1])};
 987                 v4sf tr4{vadd(pc[2*ido + i + 1], pc[2*ido - i + 1])};
 988                 v4sf cr2{vsub(tr1, tr4)};
 989                 v4sf cr4{vadd(tr1, tr4)};
 990
 991                 v4sf ti1{vadd(pc[i + 1], pc[4*ido - i + 1])};
 992                 v4sf ti2{vsub(pc[i + 1], pc[4*ido - i + 1])};
 993
 994                 ph[1] = vadd(ti2, ti3); ph += l1ido;
 995                 v4sf ci3{vsub(ti2, ti3)};
 996                 v4sf ci2{vadd(ti1, ti4)};
 997                 v4sf ci4{vsub(ti1, ti4)};
 998                 vcplxmul(cr2, ci2, ld_ps1(wa1[i-2]), ld_ps1(wa1[i-1]));
 999                 ph[0] = cr2;
1000                 ph[1] = ci2; ph += l1ido;
1001                 vcplxmul(cr3, ci3, ld_ps1(wa2[i-2]), ld_ps1(wa2[i-1]));
1002                 ph[0] = cr3;
1003                 ph[1] = ci3; ph += l1ido;
1004                 vcplxmul(cr4, ci4, ld_ps1(wa3[i-2]), ld_ps1(wa3[i-1]));
1005                 ph[0] = cr4;
1006                 ph[1] = ci4; ph = ph - 3*l1ido + 2;
1007             }
1008         }
1009         if((ido&1) == 1)
1010             return;
1011     }
1012     const v4sf minus_sqrt2{ld_ps1(-1.414213562373095f)};
1013     for(size_t k{0};k < l1ido;k += ido)
1014     {
1015         const size_t i0{4*k + ido};
1016         v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
1017         v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
1018         v4sf tr1{vsub(c,d)};
1019         v4sf tr2{vadd(c,d)};
1020         v4sf ti1{vadd(b,a)};
1021         v4sf ti2{vsub(b,a)};
1022         ch[ido-1 + k + 0*l1ido] = vadd(tr2,tr2);
1023         ch[ido-1 + k + 1*l1ido] = vmul(minus_sqrt2, vsub(ti1, tr1));
1024         ch[ido-1 + k + 2*l1ido] = vadd(ti2, ti2);
1025         ch[ido-1 + k + 3*l1ido] = vmul(minus_sqrt2, vadd(ti1, tr1));
1026     }
1027 } /* radb4 */
1028
1029 void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1030     const float *const wa1)
1031 {
1032     const v4sf tr11{ld_ps1(0.309016994374947f)};
1033     const v4sf ti11{ld_ps1(0.951056516295154f)};
1034     const v4sf tr12{ld_ps1(-0.809016994374947f)};
1035     const v4sf ti12{ld_ps1(0.587785252292473f)};
1036
1037     auto cc_ref = [&cc,l1,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1038     { return cc[(a_3*l1 + a_2)*ido + a_1]; };
1039     auto ch_ref = [&ch,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1040     { return ch[(a_3*5 + a_2)*ido + a_1]; };
1041
1042     /* Parameter adjustments */
1043     ch -= 1 + ido * 6;
1044     cc -= 1 + ido * (1 + l1);
1045
1046     const auto wa2 = wa1 + ido;
1047     const auto wa3 = wa2 + ido;
1048     const auto wa4 = wa3 + ido;
1049
1050     /* Function Body */
1051     for(size_t k{1};k <= l1;++k)
1052     {
1053         v4sf cr2{vadd(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1054         v4sf ci5{vsub(cc_ref(1, k, 5), cc_ref(1, k, 2))};
1055         v4sf cr3{vadd(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1056         v4sf ci4{vsub(cc_ref(1, k, 4), cc_ref(1, k, 3))};
1057         ch_ref(1, 1, k)   = vadd(cc_ref(1, k, 1), vadd(cr2, cr3));
1058         ch_ref(ido, 2, k) = vadd(cc_ref(1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)));
1059         ch_ref(1, 3, k)   = vmadd(ti11, ci5, vmul(ti12, ci4));
1060         ch_ref(ido, 4, k) = vadd(cc_ref(1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)));
1061         ch_ref(1, 5, k)   = vsub(vmul(ti12, ci5), vmul(ti11, ci4));
1062         //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
1063     }
1064     if(ido == 1)
1065         return;
1066
1067     const size_t idp2{ido + 2};
1068     for(size_t k{1};k <= l1;++k)
1069     {
1070         for(size_t i{3};i <= ido;i += 2)
1071         {
1072             const size_t ic{idp2 - i};
1073             v4sf dr2{ld_ps1(wa1[i-3])};
1074             v4sf di2{ld_ps1(wa1[i-2])};
1075             v4sf dr3{ld_ps1(wa2[i-3])};
1076             v4sf di3{ld_ps1(wa2[i-2])};
1077             v4sf dr4{ld_ps1(wa3[i-3])};
1078             v4sf di4{ld_ps1(wa3[i-2])};
1079             v4sf dr5{ld_ps1(wa4[i-3])};
1080             v4sf di5{ld_ps1(wa4[i-2])};
1081             vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
1082             vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
1083             vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
1084             vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
1085             v4sf cr2{vadd(dr2, dr5)};
1086             v4sf ci5{vsub(dr5, dr2)};
1087             v4sf cr5{vsub(di2, di5)};
1088             v4sf ci2{vadd(di2, di5)};
1089             v4sf cr3{vadd(dr3, dr4)};
1090             v4sf ci4{vsub(dr4, dr3)};
1091             v4sf cr4{vsub(di3, di4)};
1092             v4sf ci3{vadd(di3, di4)};
1093             ch_ref(i - 1, 1, k) = vadd(cc_ref(i - 1, k, 1), vadd(cr2, cr3));
1094             ch_ref(i, 1, k) = vsub(cc_ref(i, k, 1), vadd(ci2, ci3));
1095             v4sf tr2{vadd(cc_ref(i - 1, k, 1), vmadd(tr11, cr2, vmul(tr12, cr3)))};
1096             v4sf ti2{vsub(cc_ref(i, k, 1), vmadd(tr11, ci2, vmul(tr12, ci3)))};
1097             v4sf tr3{vadd(cc_ref(i - 1, k, 1), vmadd(tr12, cr2, vmul(tr11, cr3)))};
1098             v4sf ti3{vsub(cc_ref(i, k, 1), vmadd(tr12, ci2, vmul(tr11, ci3)))};
1099             v4sf tr5{vmadd(ti11, cr5, vmul(ti12, cr4))};
1100             v4sf ti5{vmadd(ti11, ci5, vmul(ti12, ci4))};
1101             v4sf tr4{vsub(vmul(ti12, cr5), vmul(ti11, cr4))};
1102             v4sf ti4{vsub(vmul(ti12, ci5), vmul(ti11, ci4))};
1103             ch_ref(i  - 1, 3, k) = vsub(tr2, tr5);
1104             ch_ref(ic - 1, 2, k) = vadd(tr2, tr5);
1105             ch_ref(i     , 3, k) = vadd(ti5, ti2);
1106             ch_ref(ic    , 2, k) = vsub(ti5, ti2);
1107             ch_ref(i  - 1, 5, k) = vsub(tr3, tr4);
1108             ch_ref(ic - 1, 4, k) = vadd(tr3, tr4);
1109             ch_ref(i     , 5, k) = vadd(ti4, ti3);
1110             ch_ref(ic    , 4, k) = vsub(ti4, ti3);
1111         }
1112     }
1113 } /* radf5 */
1114
1115 void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
1116     const float *const wa1)
1117 {
1118     const v4sf tr11{ld_ps1(0.309016994374947f)};
1119     const v4sf ti11{ld_ps1(0.951056516295154f)};
1120     const v4sf tr12{ld_ps1(-0.809016994374947f)};
1121     const v4sf ti12{ld_ps1(0.587785252292473f)};
1122
1123     auto cc_ref = [&cc,ido](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1124     { return cc[(a_3*5 + a_2)*ido + a_1]; };
1125     auto ch_ref = [&ch,ido,l1](size_t a_1, size_t a_2, size_t a_3) noexcept -> auto&
1126     { return ch[(a_3*l1 + a_2)*ido + a_1]; };
1127
1128     /* Parameter adjustments */
1129     ch -= 1 + ido*(1 + l1);
1130     cc -= 1 + ido*6;
1131
1132     const auto wa2 = wa1 + ido;
1133     const auto wa3 = wa2 + ido;
1134     const auto wa4 = wa3 + ido;
1135
1136     /* Function Body */
1137     for(size_t k{1};k <= l1;++k)
1138     {
1139         v4sf ti5{vadd(cc_ref(  1, 3, k), cc_ref(1, 3, k))};
1140         v4sf ti4{vadd(cc_ref(  1, 5, k), cc_ref(1, 5, k))};
1141         v4sf tr2{vadd(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
1142         v4sf tr3{vadd(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
1143         ch_ref(1, k, 1) = vadd(cc_ref(1, 1, k), vadd(tr2, tr3));
1144         v4sf cr2{vadd(cc_ref(1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1145         v4sf cr3{vadd(cc_ref(1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1146         v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1147         v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1148         ch_ref(1, k, 2) = vsub(cr2, ci5);
1149         ch_ref(1, k, 3) = vsub(cr3, ci4);
1150         ch_ref(1, k, 4) = vadd(cr3, ci4);
1151         ch_ref(1, k, 5) = vadd(cr2, ci5);
1152     }
1153     if(ido == 1)
1154         return;
1155
1156     const size_t idp2{ido + 2};
1157     for(size_t k{1};k <= l1;++k)
1158     {
1159         for(size_t i{3};i <= ido;i += 2)
1160         {
1161             const size_t ic{idp2 - i};
1162             v4sf ti5{vadd(cc_ref(i  , 3, k), cc_ref(ic  , 2, k))};
1163             v4sf ti2{vsub(cc_ref(i  , 3, k), cc_ref(ic  , 2, k))};
1164             v4sf ti4{vadd(cc_ref(i  , 5, k), cc_ref(ic  , 4, k))};
1165             v4sf ti3{vsub(cc_ref(i  , 5, k), cc_ref(ic  , 4, k))};
1166             v4sf tr5{vsub(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1167             v4sf tr2{vadd(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
1168             v4sf tr4{vsub(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1169             v4sf tr3{vadd(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
1170             ch_ref(i - 1, k, 1) = vadd(cc_ref(i-1, 1, k), vadd(tr2, tr3));
1171             ch_ref(i    , k, 1) = vadd(cc_ref(i  , 1, k), vadd(ti2, ti3));
1172             v4sf cr2{vadd(cc_ref(i-1, 1, k), vmadd(tr11, tr2, vmul(tr12, tr3)))};
1173             v4sf ci2{vadd(cc_ref(i  , 1, k), vmadd(tr11, ti2, vmul(tr12, ti3)))};
1174             v4sf cr3{vadd(cc_ref(i-1, 1, k), vmadd(tr12, tr2, vmul(tr11, tr3)))};
1175             v4sf ci3{vadd(cc_ref(i  , 1, k), vmadd(tr12, ti2, vmul(tr11, ti3)))};
1176             v4sf cr5{vmadd(ti11, tr5, vmul(ti12, tr4))};
1177             v4sf ci5{vmadd(ti11, ti5, vmul(ti12, ti4))};
1178             v4sf cr4{vsub(vmul(ti12, tr5), vmul(ti11, tr4))};
1179             v4sf ci4{vsub(vmul(ti12, ti5), vmul(ti11, ti4))};
1180             v4sf dr3{vsub(cr3, ci4)};
1181             v4sf dr4{vadd(cr3, ci4)};
1182             v4sf di3{vadd(ci3, cr4)};
1183             v4sf di4{vsub(ci3, cr4)};
1184             v4sf dr5{vadd(cr2, ci5)};
1185             v4sf dr2{vsub(cr2, ci5)};
1186             v4sf di5{vsub(ci2, cr5)};
1187             v4sf di2{vadd(ci2, cr5)};
1188             vcplxmul(dr2, di2, ld_ps1(wa1[i-3]), ld_ps1(wa1[i-2]));
1189             vcplxmul(dr3, di3, ld_ps1(wa2[i-3]), ld_ps1(wa2[i-2]));
1190             vcplxmul(dr4, di4, ld_ps1(wa3[i-3]), ld_ps1(wa3[i-2]));
1191             vcplxmul(dr5, di5, ld_ps1(wa4[i-3]), ld_ps1(wa4[i-2]));
1192
1193             ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
1194             ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
1195             ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
1196             ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
1197         }
1198     }
1199 } /* radb5 */
1200
1201 NOINLINE v4sf *rfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1202     const float *wa, const al::span<const uint,15> ifac)
1203 {
1204     assert(work1 != work2);
1205
1206     const v4sf *in{input_readonly};
1207     v4sf *out{in == work2 ? work1 : work2};
1208     const size_t nf{ifac[1]};
1209     size_t l2{n};
1210     size_t iw{n-1};
1211     size_t k1{1};
1212     while(true)
1213     {
1214         const size_t kh{nf - k1};
1215         const size_t ip{ifac[kh + 2]};
1216         const size_t l1{l2 / ip};
1217         const size_t ido{n / l2};
1218         iw -= (ip - 1)*ido;
1219         switch(ip)
1220         {
1221         case 5:
1222             radf5_ps(ido, l1, in, out, &wa[iw]);
1223             break;
1224         case 4:
1225             radf4_ps(ido, l1, in, out, &wa[iw]);
1226             break;
1227         case 3:
1228             radf3_ps(ido, l1, in, out, &wa[iw]);
1229             break;
1230         case 2:
1231             radf2_ps(ido, l1, in, out, &wa[iw]);
1232             break;
1233         default:
1234             assert(0);
1235         }
1236         if(++k1 > nf)
1237             return out;
1238
1239         l2 = l1;
1240         if(out == work2)
1241         {
1242             out = work1;
1243             in = work2;
1244         }
1245         else
1246         {
1247             out = work2;
1248             in = work1;
1249         }
1250     }
1251 } /* rfftf1 */
1252
1253 NOINLINE v4sf *rfftb1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1254     const float *wa, const al::span<const uint,15> ifac)
1255 {
1256     assert(work1 != work2);
1257
1258     const v4sf *in{input_readonly};
1259     v4sf *out{in == work2 ? work1 : work2};
1260     const size_t nf{ifac[1]};
1261     size_t l1{1};
1262     size_t iw{0};
1263     size_t k1{1};
1264     while(true)
1265     {
1266         const size_t ip{ifac[k1 + 1]};
1267         const size_t l2{ip*l1};
1268         const size_t ido{n / l2};
1269         switch(ip)
1270         {
1271         case 5:
1272             radb5_ps(ido, l1, in, out, &wa[iw]);
1273             break;
1274         case 4:
1275             radb4_ps(ido, l1, in, out, &wa[iw]);
1276             break;
1277         case 3:
1278             radb3_ps(ido, l1, in, out, &wa[iw]);
1279             break;
1280         case 2:
1281             radb2_ps(ido, l1, in, out, &wa[iw]);
1282             break;
1283         default:
1284             assert(0);
1285         }
1286         if(++k1 > nf)
1287             return out;
1288
1289         l1 = l2;
1290         iw += (ip - 1)*ido;
1291
1292         if(out == work2)
1293         {
1294             out = work1;
1295             in = work2;
1296         }
1297         else
1298         {
1299             out = work2;
1300             in = work1;
1301         }
1302     }
1303 }
1304
1305 v4sf *cfftf1_ps(const size_t n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1306     const float *wa, const al::span<const uint,15> ifac, const float fsign)
1307 {
1308     assert(work1 != work2);
1309
1310     const v4sf *in{input_readonly};
1311     v4sf *out{in == work2 ? work1 : work2};
1312     const size_t nf{ifac[1]};
1313     size_t l1{1}, iw{0};
1314     size_t k1{2};
1315     while(true)
1316     {
1317         const size_t ip{ifac[k1]};
1318         const size_t l2{ip*l1};
1319         const size_t ido{n / l2};
1320         const size_t idot{ido + ido};
1321         switch(ip)
1322         {
1323         case 5:
1324             passf5_ps(idot, l1, in, out, &wa[iw], fsign);
1325             break;
1326         case 4:
1327             passf4_ps(idot, l1, in, out, &wa[iw], fsign);
1328             break;
1329         case 3:
1330             passf3_ps(idot, l1, in, out, &wa[iw], fsign);
1331             break;
1332         case 2:
1333             passf2_ps(idot, l1, in, out, &wa[iw], fsign);
1334             break;
1335         default:
1336             assert(0);
1337         }
1338         if(++k1 > nf+1)
1339             return out;
1340
1341         l1 = l2;
1342         iw += (ip - 1)*idot;
1343         if(out == work2)
1344         {
1345             out = work1;
1346             in = work2;
1347         }
1348         else
1349         {
1350             out = work2;
1351             in = work1;
1352         }
1353     }
1354 }
1355
1356
1357 uint decompose(const uint n, const al::span<uint,15> ifac, const al::span<const uint,4> ntryh)
1358 {
1359     uint nl{n}, nf{0};
1360     for(const uint ntry : ntryh)
1361     {
1362         while(nl != 1)
1363         {
1364             const uint nq{nl / ntry};
1365             const uint nr{nl % ntry};
1366             if(nr != 0) break;
1367
1368             ifac[2+nf++] = ntry;
1369             nl = nq;
1370             if(ntry == 2 && nf != 1)
1371             {
1372                 for(size_t i{2};i <= nf;++i)
1373                 {
1374                     size_t ib{nf - i + 2};
1375                     ifac[ib + 1] = ifac[ib];
1376                 }
1377                 ifac[2] = 2;
1378             }
1379         }
1380     }
1381     ifac[0] = n;
1382     ifac[1] = nf;
1383     return nf;
1384 }
1385
1386 void rffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1387 {
1388     static constexpr std::array ntryh{4u,2u,3u,5u};
1389
1390     const uint nf{decompose(n, ifac, ntryh)};
1391     const double argh{2.0*al::numbers::pi / n};
1392     size_t is{0};
1393     size_t nfm1{nf - 1};
1394     size_t l1{1};
1395     for(size_t k1{0};k1 < nfm1;++k1)
1396     {
1397         const size_t ip{ifac[k1+2]};
1398         const size_t l2{l1*ip};
1399         const size_t ido{n / l2};
1400         const size_t ipm{ip - 1};
1401         size_t ld{0};
1402         for(size_t j{0};j < ipm;++j)
1403         {
1404             size_t i{is};
1405             ld += l1;
1406             const double argld{static_cast<double>(ld)*argh};
1407             double fi{0.0};
1408             for(size_t ii{2};ii < ido;ii += 2)
1409             {
1410                 fi += 1.0;
1411                 wa[i++] = static_cast<float>(std::cos(fi*argld));
1412                 wa[i++] = static_cast<float>(std::sin(fi*argld));
1413             }
1414             is += ido;
1415         }
1416         l1 = l2;
1417     }
1418 } /* rffti1 */
1419
1420 void cffti1_ps(const uint n, float *wa, const al::span<uint,15> ifac)
1421 {
1422     static constexpr std::array ntryh{5u,3u,4u,2u};
1423
1424     const uint nf{decompose(n, ifac, ntryh)};
1425     const double argh{2.0*al::numbers::pi / n};
1426     size_t i{1};
1427     size_t l1{1};
1428     for(size_t k1{0};k1 < nf;++k1)
1429     {
1430         const size_t ip{ifac[k1+2]};
1431         const size_t l2{l1*ip};
1432         const size_t ido{n / l2};
1433         const size_t idot{ido + ido + 2};
1434         const size_t ipm{ip - 1};
1435         size_t ld{0};
1436         for(size_t j{0};j < ipm;++j)
1437         {
1438             size_t i1{i};
1439             wa[i-1] = 1;
1440             wa[i] = 0;
1441             ld += l1;
1442             const double argld{static_cast<double>(ld)*argh};
1443             double fi{0.0};
1444             for(size_t ii{3};ii < idot;ii += 2)
1445             {
1446                 fi += 1.0;
1447                 wa[++i] = static_cast<float>(std::cos(fi*argld));
1448                 wa[++i] = static_cast<float>(std::sin(fi*argld));
1449             }
1450             if(ip > 5)
1451             {
1452                 wa[i1-1] = wa[i-1];
1453                 wa[i1] = wa[i];
1454             }
1455         }
1456         l1 = l2;
1457     }
1458 } /* cffti1 */
1459
1460 } // namespace
1461
1462 /* NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) */
1463 struct PFFFT_Setup {
1464     uint N{};
1465     uint Ncvec{}; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
1466     std::array<uint,15> ifac{};
1467     pffft_transform_t transform{};
1468
1469     float *twiddle{}; /* N/4 elements */
1470     al::span<v4sf> e; /* N/4*3 elements */
1471
1472     alignas(V4sfAlignment) std::byte end;
1473 };
1474
1475 PFFFTSetupPtr pffft_new_setup(unsigned int N, pffft_transform_t transform)
1476 {
1477     assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX);
1478     assert(N > 0);
1479     /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1480      * and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1481      * handle other cases (or maybe just switch to a scalar fft, I don't know..)
1482      */
1483     if(transform == PFFFT_REAL)
1484         assert((N%(2*SimdSize*SimdSize)) == 0);
1485     else
1486         assert((N%(SimdSize*SimdSize)) == 0);
1487
1488     const uint Ncvec{(transform == PFFFT_REAL ? N/2 : N) / SimdSize};
1489
1490     const size_t storelen{std::max(offsetof(PFFFT_Setup, end) + 2_zu*Ncvec*sizeof(v4sf),
1491         sizeof(PFFFT_Setup))};
1492     auto storage = static_cast<gsl::owner<std::byte*>>(::operator new[](storelen, V4sfAlignVal));
1493     al::span extrastore{&storage[offsetof(PFFFT_Setup, end)], 2_zu*Ncvec*sizeof(v4sf)};
1494
1495     PFFFTSetupPtr s{::new(storage) PFFFT_Setup{}};
1496     s->N = N;
1497     s->transform = transform;
1498     s->Ncvec = Ncvec;
1499
1500     const size_t ecount{2_zu*Ncvec*(SimdSize-1)/SimdSize};
1501     s->e = {std::launder(reinterpret_cast<v4sf*>(extrastore.data())), ecount};
1502     s->twiddle = std::launder(reinterpret_cast<float*>(&extrastore[ecount*sizeof(v4sf)]));
1503
1504     if constexpr(SimdSize > 1)
1505     {
1506         auto e = std::vector<float>(s->e.size()*SimdSize, 0.0f);
1507         for(size_t k{0};k < s->Ncvec;++k)
1508         {
1509             const size_t i{k / SimdSize};
1510             const size_t j{k % SimdSize};
1511             for(size_t m{0};m < SimdSize-1;++m)
1512             {
1513                 const double A{-2.0*al::numbers::pi*static_cast<double>((m+1)*k) / N};
1514                 e[((i*3 + m)*2 + 0)*SimdSize + j] = static_cast<float>(std::cos(A));
1515                 e[((i*3 + m)*2 + 1)*SimdSize + j] = static_cast<float>(std::sin(A));
1516             }
1517         }
1518         std::memcpy(s->e.data(), e.data(), e.size()*sizeof(float));
1519     }
1520     if(transform == PFFFT_REAL)
1521         rffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1522     else
1523         cffti1_ps(N/SimdSize, s->twiddle, s->ifac);
1524
1525     /* check that N is decomposable with allowed prime factors */
1526     size_t m{1};
1527     for(size_t k{0};k < s->ifac[1];++k)
1528         m *= s->ifac[2+k];
1529
1530     if(m != N/SimdSize)
1531         s = nullptr;
1532
1533     return s;
1534 }
1535
1536
1537 void pffft_destroy_setup(gsl::owner<PFFFT_Setup*> s) noexcept
1538 {
1539     std::destroy_at(s);
1540     ::operator delete[](gsl::owner<void*>{s}, V4sfAlignVal);
1541 }
1542
1543 #if !defined(PFFFT_SIMD_DISABLE)
1544
1545 namespace {
1546
1547 /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1548 void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *RESTRICT out)
1549 {
1550     v4sf g0, g1;
1551     interleave2(in[0], in[1], g0, g1);
1552     in += in_stride;
1553
1554     *--out = vswaphl(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1555     for(size_t k{1};k < N;++k)
1556     {
1557         v4sf h0, h1;
1558         interleave2(in[0], in[1], h0, h1);
1559         in += in_stride;
1560         *--out = vswaphl(g1, h0);
1561         *--out = vswaphl(h0, h1);
1562         g1 = h1;
1563     }
1564     *--out = vswaphl(g1, g0);
1565 }
1566
1567 void unreversed_copy(const size_t N, const v4sf *in, v4sf *RESTRICT out, const int out_stride)
1568 {
1569     v4sf g0{in[0]}, g1{g0};
1570     ++in;
1571     for(size_t k{1};k < N;++k)
1572     {
1573         v4sf h0{*in++}; v4sf h1{*in++};
1574         g1 = vswaphl(g1, h0);
1575         h0 = vswaphl(h0, h1);
1576         uninterleave2(h0, g1, out[0], out[1]);
1577         out += out_stride;
1578         g1 = h1;
1579     }
1580     v4sf h0{*in++}, h1{g0};
1581     g1 = vswaphl(g1, h0);
1582     h0 = vswaphl(h0, h1);
1583     uninterleave2(h0, g1, out[0], out[1]);
1584 }
1585
1586 void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1587 {
1588     assert(in != out);
1589
1590     const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1591     for(size_t k{0};k < dk;++k)
1592     {
1593         v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1594         v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1595         v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1596         v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1597         vtranspose4(r0,r1,r2,r3);
1598         vtranspose4(i0,i1,i2,i3);
1599         vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
1600         vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
1601         vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);
1602
1603         v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1604         v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1605         v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1606         v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1607
1608         /* transformation for each column is:
1609          *
1610          * [1   1   1   1   0   0   0   0]   [r0]
1611          * [1   0  -1   0   0  -1   0   1]   [r1]
1612          * [1  -1   1  -1   0   0   0   0]   [r2]
1613          * [1   0  -1   0   0   1   0  -1]   [r3]
1614          * [0   0   0   0   1   1   1   1] * [i0]
1615          * [0   1   0  -1   1   0  -1   0]   [i1]
1616          * [0   0   0   0   1  -1   1  -1]   [i2]
1617          * [0  -1   0   1   1   0  -1   0]   [i3]
1618          */
1619
1620         r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1621         r1 = vadd(dr0, di1); i1 = vsub(di0, dr1);
1622         r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1623         r3 = vsub(dr0, di1); i3 = vadd(di0, dr1);
1624
1625         *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1626         *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1627     }
1628 }
1629
1630 void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out, const v4sf *e)
1631 {
1632     assert(in != out);
1633
1634     const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1635     for(size_t k{0};k < dk;++k)
1636     {
1637         v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
1638         v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
1639         v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
1640         v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
1641
1642         v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0, r2)};
1643         v4sf sr1{vadd(r1,r3)}, dr1{vsub(r1, r3)};
1644         v4sf si0{vadd(i0,i2)}, di0{vsub(i0, i2)};
1645         v4sf si1{vadd(i1,i3)}, di1{vsub(i1, i3)};
1646
1647         r0 = vadd(sr0, sr1); i0 = vadd(si0, si1);
1648         r1 = vsub(dr0, di1); i1 = vadd(di0, dr1);
1649         r2 = vsub(sr0, sr1); i2 = vsub(si0, si1);
1650         r3 = vadd(dr0, di1); i3 = vsub(di0, dr1);
1651
1652         vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
1653         vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
1654         vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);
1655
1656         vtranspose4(r0,r1,r2,r3);
1657         vtranspose4(i0,i1,i2,i3);
1658
1659         *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1660         *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1661     }
1662 }
1663
1664
1665 force_inline void pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1666     const v4sf *e, v4sf *RESTRICT out)
1667 {
1668     v4sf r0{*in0}, i0{*in1};
1669     v4sf r1{*in++}; v4sf i1{*in++};
1670     v4sf r2{*in++}; v4sf i2{*in++};
1671     v4sf r3{*in++}; v4sf i3{*in++};
1672     vtranspose4(r0,r1,r2,r3);
1673     vtranspose4(i0,i1,i2,i3);
1674
1675     /* transformation for each column is:
1676      *
1677      * [1   1   1   1   0   0   0   0]   [r0]
1678      * [1   0  -1   0   0  -1   0   1]   [r1]
1679      * [1   0  -1   0   0   1   0  -1]   [r2]
1680      * [1  -1   1  -1   0   0   0   0]   [r3]
1681      * [0   0   0   0   1   1   1   1] * [i0]
1682      * [0  -1   0   1  -1   0   1   0]   [i1]
1683      * [0  -1   0   1   1   0  -1   0]   [i2]
1684      * [0   0   0   0  -1   1  -1   1]   [i3]
1685      */
1686
1687     //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1688     //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1689
1690     vcplxmul(r1,i1,e[0],e[1]);
1691     vcplxmul(r2,i2,e[2],e[3]);
1692     vcplxmul(r3,i3,e[4],e[5]);
1693
1694     //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1695     //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1696
1697     v4sf sr0{vadd(r0,r2)}, dr0{vsub(r0,r2)};
1698     v4sf sr1{vadd(r1,r3)}, dr1{vsub(r3,r1)};
1699     v4sf si0{vadd(i0,i2)}, di0{vsub(i0,i2)};
1700     v4sf si1{vadd(i1,i3)}, di1{vsub(i3,i1)};
1701
1702     r0 = vadd(sr0, sr1);
1703     r3 = vsub(sr0, sr1);
1704     i0 = vadd(si0, si1);
1705     i3 = vsub(si1, si0);
1706     r1 = vadd(dr0, di1);
1707     r2 = vsub(dr0, di1);
1708     i1 = vsub(dr1, di0);
1709     i2 = vadd(dr1, di0);
1710
1711     *out++ = r0;
1712     *out++ = i0;
1713     *out++ = r1;
1714     *out++ = i1;
1715     *out++ = r2;
1716     *out++ = i2;
1717     *out++ = r3;
1718     *out++ = i3;
1719 }
1720
1721 NOINLINE void pffft_real_finalize(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1722     const v4sf *e)
1723 {
1724     static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
1725
1726     assert(in != out);
1727     const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1728     /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1729
1730     const v4sf zero{vzero()};
1731     const auto cr = al::bit_cast<std::array<float,SimdSize>>(in[0]);
1732     const auto ci = al::bit_cast<std::array<float,SimdSize>>(in[Ncvec*2-1]);
1733     pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1734
1735     /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1736      *
1737      * [Xr(1)]  ] [1   1   1   1   0   0   0   0]
1738      * [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
1739      * [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
1740      * [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
1741      * [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
1742      * [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
1743      * [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
1744      * [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
1745      */
1746
1747     const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; out[0] = vinsert0(out[0], xr0);
1748     const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; out[1] = vinsert0(out[1], xi0);
1749     const float xr2{(cr[0]-cr[2])};                 out[4] = vinsert0(out[4], xr2);
1750     const float xi2{(cr[3]-cr[1])};                 out[5] = vinsert0(out[5], xi2);
1751     const float xr1{ ci[0] + s*(ci[1]-ci[3])};      out[2] = vinsert0(out[2], xr1);
1752     const float xi1{-ci[2] - s*(ci[1]+ci[3])};      out[3] = vinsert0(out[3], xi1);
1753     const float xr3{ ci[0] - s*(ci[1]-ci[3])};      out[6] = vinsert0(out[6], xr3);
1754     const float xi3{ ci[2] - s*(ci[1]+ci[3])};      out[7] = vinsert0(out[7], xi3);
1755
1756     for(size_t k{1};k < dk;++k)
1757         pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
1758 }
1759
1760 force_inline void pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *RESTRICT out,
1761     const bool first)
1762 {
1763     v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
1764     v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};
1765
1766     /* transformation for each column is:
1767      *
1768      * [1   1   1   1   0   0   0   0]   [r0]
1769      * [1   0   0  -1   0  -1  -1   0]   [r1]
1770      * [1  -1  -1   1   0   0   0   0]   [r2]
1771      * [1   0   0  -1   0   1   1   0]   [r3]
1772      * [0   0   0   0   1  -1   1  -1] * [i0]
1773      * [0  -1   1   0   1   0   0   1]   [i1]
1774      * [0   0   0   0   1   1  -1  -1]   [i2]
1775      * [0   1  -1   0   1   0   0   1]   [i3]
1776      */
1777
1778     v4sf sr0{vadd(r0,r3)}, dr0{vsub(r0,r3)};
1779     v4sf sr1{vadd(r1,r2)}, dr1{vsub(r1,r2)};
1780     v4sf si0{vadd(i0,i3)}, di0{vsub(i0,i3)};
1781     v4sf si1{vadd(i1,i2)}, di1{vsub(i1,i2)};
1782
1783     r0 = vadd(sr0, sr1);
1784     r2 = vsub(sr0, sr1);
1785     r1 = vsub(dr0, si1);
1786     r3 = vadd(dr0, si1);
1787     i0 = vsub(di0, di1);
1788     i2 = vadd(di0, di1);
1789     i1 = vsub(si0, dr1);
1790     i3 = vadd(si0, dr1);
1791
1792     vcplxmulconj(r1,i1,e[0],e[1]);
1793     vcplxmulconj(r2,i2,e[2],e[3]);
1794     vcplxmulconj(r3,i3,e[4],e[5]);
1795
1796     vtranspose4(r0,r1,r2,r3);
1797     vtranspose4(i0,i1,i2,i3);
1798
1799     if(!first)
1800     {
1801         *out++ = r0;
1802         *out++ = i0;
1803     }
1804     *out++ = r1;
1805     *out++ = i1;
1806     *out++ = r2;
1807     *out++ = i2;
1808     *out++ = r3;
1809     *out++ = i3;
1810 }
1811
1812 NOINLINE void pffft_real_preprocess(const size_t Ncvec, const v4sf *in, v4sf *RESTRICT out,
1813     const v4sf *e)
1814 {
1815     static constexpr float sqrt2{al::numbers::sqrt2_v<float>};
1816
1817     assert(in != out);
1818     const size_t dk{Ncvec/SimdSize}; // number of 4x4 matrix blocks
1819     /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1820
1821     std::array<float,SimdSize> Xr{}, Xi{};
1822     for(size_t k{0};k < SimdSize;++k)
1823     {
1824         Xr[k] = vextract0(in[2*k]);
1825         Xi[k] = vextract0(in[2*k + 1]);
1826     }
1827
1828     pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values
1829
1830     /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1831      *
1832      * [cr0] [1   0   2   0   1   0   0   0]
1833      * [cr1] [1   0   0   0  -1   0  -2   0]
1834      * [cr2] [1   0  -2   0   1   0   0   0]
1835      * [cr3] [1   0   0   0  -1   0   2   0]
1836      * [ci0] [0   2   0   2   0   0   0   0]
1837      * [ci1] [0   s   0  -s   0  -s   0  -s]
1838      * [ci2] [0   0   0   0   0  -2   0   2]
1839      * [ci3] [0  -s   0   s   0  -s   0  -s]
1840      */
1841     for(size_t k{1};k < dk;++k)
1842         pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);
1843
1844     const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
1845     const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
1846     const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
1847     const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
1848     out[0] = vset4(cr0, cr1, cr2, cr3);
1849     const float ci0{     2*(Xr[1]+Xr[3])};
1850     const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1851     const float ci2{     2*(Xi[3]-Xi[1])};
1852     const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
1853     out[2*Ncvec-1] = vset4(ci0, ci1, ci2, ci3);
1854 }
1855
1856
1857 void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
1858     v4sf *scratch, const pffft_direction_t direction, const bool ordered)
1859 {
1860     assert(scratch != nullptr);
1861     assert(voutput != scratch);
1862
1863     const size_t Ncvec{setup->Ncvec};
1864     const bool nf_odd{(setup->ifac[1]&1) != 0};
1865
1866     std::array buff{voutput, scratch};
1867     bool ib{nf_odd != ordered};
1868     if(direction == PFFFT_FORWARD)
1869     {
1870         /* Swap the initial work buffer for forward FFTs, which helps avoid an
1871          * extra copy for output.
1872          */
1873         ib = !ib;
1874         if(setup->transform == PFFFT_REAL)
1875         {
1876             ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
1877             pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1878         }
1879         else
1880         {
1881             v4sf *tmp{buff[ib]};
1882             for(size_t k=0; k < Ncvec; ++k)
1883                 uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
1884
1885             ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
1886             pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e.data());
1887         }
1888         if(ordered)
1889             pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
1890                 reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
1891         else
1892             ib = !ib;
1893     }
1894     else
1895     {
1896         if(vinput == buff[ib])
1897             ib = !ib; // may happen when finput == foutput
1898
1899         if(ordered)
1900         {
1901             pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
1902                 reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
1903             vinput = buff[ib];
1904             ib = !ib;
1905         }
1906         if(setup->transform == PFFFT_REAL)
1907         {
1908             pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1909             ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
1910         }
1911         else
1912         {
1913             pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e.data());
1914             ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],  setup->twiddle, setup->ifac, +1.0f) == buff[1]);
1915             for(size_t k{0};k < Ncvec;++k)
1916                 interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
1917         }
1918     }
1919
1920     if(buff[ib] != voutput)
1921     {
1922         /* extra copy required -- this situation should only happen when finput == foutput */
1923         assert(vinput==voutput);
1924         for(size_t k{0};k < Ncvec;++k)
1925         {
1926             v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
1927             voutput[2*k] = a; voutput[2*k+1] = b;
1928         }
1929     }
1930 }
1931
1932 } // namespace
1933
1934 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
1935     pffft_direction_t direction)
1936 {
1937     assert(in != out);
1938
1939     const size_t N{setup->N}, Ncvec{setup->Ncvec};
1940     const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
1941     v4sf *RESTRICT vout{reinterpret_cast<v4sf*>(out)};
1942     if(setup->transform == PFFFT_REAL)
1943     {
1944         const size_t dk{N/32};
1945         if(direction == PFFFT_FORWARD)
1946         {
1947             for(size_t k{0};k < dk;++k)
1948             {
1949                 interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1950                 interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1951             }
1952             reversed_copy(dk, vin+2, 8, vout + N/SimdSize/2);
1953             reversed_copy(dk, vin+6, 8, vout + N/SimdSize);
1954         }
1955         else
1956         {
1957             for(size_t k{0};k < dk;++k)
1958             {
1959                 uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1960                 uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1961             }
1962             unreversed_copy(dk, vin + N/SimdSize/4, vout + N/SimdSize - 6, -8);
1963             unreversed_copy(dk, vin + 3_uz*N/SimdSize/4, vout + N/SimdSize - 2, -8);
1964         }
1965     }
1966     else
1967     {
1968         if(direction == PFFFT_FORWARD)
1969         {
1970             for(size_t k{0};k < Ncvec;++k)
1971             {
1972                 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1973                 interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1974             }
1975         }
1976         else
1977         {
1978             for(size_t k{0};k < Ncvec;++k)
1979             {
1980                 size_t kk{(k/4) + (k%4)*(Ncvec/4)};
1981                 uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1982             }
1983         }
1984     }
1985 }
1986
1987 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
1988     float *ab, float scaling)
1989 {
1990     const size_t Ncvec{s->Ncvec};
1991     const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
1992     const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
1993     v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
1994
1995 #ifdef __arm__
1996     __builtin_prefetch(va);
1997     __builtin_prefetch(vb);
1998     __builtin_prefetch(vab);
1999     __builtin_prefetch(va+2);
2000     __builtin_prefetch(vb+2);
2001     __builtin_prefetch(vab+2);
2002     __builtin_prefetch(va+4);
2003     __builtin_prefetch(vb+4);
2004     __builtin_prefetch(vab+4);
2005     __builtin_prefetch(va+6);
2006     __builtin_prefetch(vb+6);
2007     __builtin_prefetch(vab+6);
2008 #ifndef __clang__
2009 #define ZCONVOLVE_USING_INLINE_NEON_ASM
2010 #endif
2011 #endif
2012
2013     const float ar1{vextract0(va[0])};
2014     const float ai1{vextract0(va[1])};
2015     const float br1{vextract0(vb[0])};
2016     const float bi1{vextract0(vb[1])};
2017     const float abr1{vextract0(vab[0])};
2018     const float abi1{vextract0(vab[1])};
2019
2020 #ifdef ZCONVOLVE_USING_INLINE_ASM
2021     /* Inline asm version, unfortunately miscompiled by clang 3.2, at least on
2022      * Ubuntu. So this will be restricted to GCC.
2023      *
2024      * Does it still miscompile with Clang? Is it even needed with today's
2025      * optimizers?
2026      */
2027     const float *a_{a}, *b_{b}; float *ab_{ab};
2028     size_t N{Ncvec};
2029     asm volatile("mov         r8, %2                  \n"
2030                 "vdup.f32    q15, %4                 \n"
2031                 "1:                                  \n"
2032                 "pld         [%0,#64]                \n"
2033                 "pld         [%1,#64]                \n"
2034                 "pld         [%2,#64]                \n"
2035                 "pld         [%0,#96]                \n"
2036                 "pld         [%1,#96]                \n"
2037                 "pld         [%2,#96]                \n"
2038                 "vld1.f32    {q0,q1},   [%0,:128]!         \n"
2039                 "vld1.f32    {q4,q5},   [%1,:128]!         \n"
2040                 "vld1.f32    {q2,q3},   [%0,:128]!         \n"
2041                 "vld1.f32    {q6,q7},   [%1,:128]!         \n"
2042                 "vld1.f32    {q8,q9},   [r8,:128]!          \n"
2043
2044                 "vmul.f32    q10, q0, q4             \n"
2045                 "vmul.f32    q11, q0, q5             \n"
2046                 "vmul.f32    q12, q2, q6             \n"
2047                 "vmul.f32    q13, q2, q7             \n"
2048                 "vmls.f32    q10, q1, q5             \n"
2049                 "vmla.f32    q11, q1, q4             \n"
2050                 "vld1.f32    {q0,q1}, [r8,:128]!     \n"
2051                 "vmls.f32    q12, q3, q7             \n"
2052                 "vmla.f32    q13, q3, q6             \n"
2053                 "vmla.f32    q8, q10, q15            \n"
2054                 "vmla.f32    q9, q11, q15            \n"
2055                 "vmla.f32    q0, q12, q15            \n"
2056                 "vmla.f32    q1, q13, q15            \n"
2057                 "vst1.f32    {q8,q9},[%2,:128]!    \n"
2058                 "vst1.f32    {q0,q1},[%2,:128]!    \n"
2059                 "subs        %3, #2                  \n"
2060                 "bne         1b                      \n"
2061                 : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
2062
2063 #else
2064
2065     /* Default routine, works fine for non-arm cpus with current compilers. */
2066     const v4sf vscal{ld_ps1(scaling)};
2067     for(size_t i{0};i < Ncvec;i += 2)
2068     {
2069         v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2070         v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2071         vcplxmul(ar4, ai4, br4, bi4);
2072         vab[2*i+0] = vmadd(ar4, vscal, vab[2*i+0]);
2073         vab[2*i+1] = vmadd(ai4, vscal, vab[2*i+1]);
2074         ar4 = va[2*i+2]; ai4 = va[2*i+3];
2075         br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2076         vcplxmul(ar4, ai4, br4, bi4);
2077         vab[2*i+2] = vmadd(ar4, vscal, vab[2*i+2]);
2078         vab[2*i+3] = vmadd(ai4, vscal, vab[2*i+3]);
2079     }
2080 #endif
2081
2082     if(s->transform == PFFFT_REAL)
2083     {
2084         vab[0] = vinsert0(vab[0], abr1 + ar1*br1*scaling);
2085         vab[1] = vinsert0(vab[1], abi1 + ai1*bi1*scaling);
2086     }
2087 }
2088
2089 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2090 {
2091     const size_t Ncvec{s->Ncvec};
2092     const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
2093     const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
2094     v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
2095
2096 #ifdef __arm__
2097     __builtin_prefetch(va);
2098     __builtin_prefetch(vb);
2099     __builtin_prefetch(vab);
2100     __builtin_prefetch(va+2);
2101     __builtin_prefetch(vb+2);
2102     __builtin_prefetch(vab+2);
2103     __builtin_prefetch(va+4);
2104     __builtin_prefetch(vb+4);
2105     __builtin_prefetch(vab+4);
2106     __builtin_prefetch(va+6);
2107     __builtin_prefetch(vb+6);
2108     __builtin_prefetch(vab+6);
2109 #endif
2110
2111     const float ar1{vextract0(va[0])};
2112     const float ai1{vextract0(va[1])};
2113     const float br1{vextract0(vb[0])};
2114     const float bi1{vextract0(vb[1])};
2115     const float abr1{vextract0(vab[0])};
2116     const float abi1{vextract0(vab[1])};
2117
2118     /* No inline assembly for this version. I'm not familiar enough with NEON
2119      * assembly, and I don't know that it's needed with today's optimizers.
2120      */
2121     for(size_t i{0};i < Ncvec;i += 2)
2122     {
2123         v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
2124         v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
2125         vcplxmul(ar4, ai4, br4, bi4);
2126         vab[2*i+0] = vadd(ar4, vab[2*i+0]);
2127         vab[2*i+1] = vadd(ai4, vab[2*i+1]);
2128         ar4 = va[2*i+2]; ai4 = va[2*i+3];
2129         br4 = vb[2*i+2]; bi4 = vb[2*i+3];
2130         vcplxmul(ar4, ai4, br4, bi4);
2131         vab[2*i+2] = vadd(ar4, vab[2*i+2]);
2132         vab[2*i+3] = vadd(ai4, vab[2*i+3]);
2133     }
2134
2135     if(s->transform == PFFFT_REAL)
2136     {
2137         vab[0] = vinsert0(vab[0], abr1 + ar1*br1);
2138         vab[1] = vinsert0(vab[1], abi1 + ai1*bi1);
2139     }
2140 }
2141
2142
2143 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2144     pffft_direction_t direction)
2145 {
2146     assert(valigned(input) && valigned(output) && valigned(work));
2147     pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2148         reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2149         reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
2150 }
2151
2152 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2153     float *work, pffft_direction_t direction)
2154 {
2155     assert(valigned(input) && valigned(output) && valigned(work));
2156     pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
2157         reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
2158         reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
2159 }
2160
2161 #else // defined(PFFFT_SIMD_DISABLE)
2162
2163 // standard routine using scalar floats, without SIMD stuff.
2164
2165 namespace {
2166
2167 void pffft_transform_internal(const PFFFT_Setup *setup, const float *input, float *output,
2168     float *scratch, const pffft_direction_t direction, bool ordered)
2169 {
2170     const size_t Ncvec{setup->Ncvec};
2171     const bool nf_odd{(setup->ifac[1]&1) != 0};
2172
2173     assert(scratch != nullptr);
2174
2175     /* z-domain data for complex transforms is already ordered without SIMD. */
2176     if(setup->transform == PFFFT_COMPLEX)
2177         ordered = false;
2178
2179     float *buff[2]{output, scratch};
2180     bool ib{nf_odd != ordered};
2181     if(direction == PFFFT_FORWARD)
2182     {
2183         if(setup->transform == PFFFT_REAL)
2184             ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
2185         else
2186             ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
2187         if(ordered)
2188         {
2189             pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
2190             ib = !ib;
2191         }
2192     }
2193     else
2194     {
2195         if (input == buff[ib])
2196             ib = !ib; // may happen when finput == foutput
2197
2198         if(ordered)
2199         {
2200             pffft_zreorder(setup, input, buff[ib], PFFFT_BACKWARD);
2201             input = buff[ib];
2202             ib = !ib;
2203         }
2204         if(setup->transform == PFFFT_REAL)
2205             ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],  setup->twiddle, setup->ifac) == buff[1]);
2206         else
2207             ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
2208     }
2209     if(buff[ib] != output)
2210     {
2211         // extra copy required -- this situation should happens only when finput == foutput
2212         assert(input==output);
2213         for(size_t k{0};k < Ncvec;++k)
2214         {
2215             float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
2216             output[2*k] = a; output[2*k+1] = b;
2217         }
2218     }
2219 }
2220
2221 } // namespace
2222
2223 void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *RESTRICT out,
2224     pffft_direction_t direction)
2225 {
2226     const size_t N{setup->N};
2227     if(setup->transform == PFFFT_COMPLEX)
2228     {
2229         for(size_t k{0};k < 2*N;++k)
2230             out[k] = in[k];
2231         return;
2232     }
2233     else if(direction == PFFFT_FORWARD)
2234     {
2235         float x_N{in[N-1]};
2236         for(size_t k{N-1};k > 1;--k)
2237             out[k] = in[k-1];
2238         out[0] = in[0];
2239         out[1] = x_N;
2240     }
2241     else
2242     {
2243         float x_N{in[1]};
2244         for(size_t k{1};k < N-1;++k)
2245             out[k] = in[k+1];
2246         out[0] = in[0];
2247         out[N-1] = x_N;
2248     }
2249 }
2250
2251 void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, const float *b,
2252     float *ab, float scaling)
2253 {
2254     size_t Ncvec{s->Ncvec};
2255
2256     if(s->transform == PFFFT_REAL)
2257     {
2258         // take care of the fftpack ordering
2259         ab[0] += a[0]*b[0]*scaling;
2260         ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
2261         ++ab; ++a; ++b; --Ncvec;
2262     }
2263     for(size_t i{0};i < Ncvec;++i)
2264     {
2265         float ar{a[2*i+0]}, ai{a[2*i+1]};
2266         const float br{b[2*i+0]}, bi{b[2*i+1]};
2267         vcplxmul(ar, ai, br, bi);
2268         ab[2*i+0] += ar*scaling;
2269         ab[2*i+1] += ai*scaling;
2270     }
2271 }
2272
2273 void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const float *b, float *ab)
2274 {
2275     size_t Ncvec{s->Ncvec};
2276
2277     if(s->transform == PFFFT_REAL)
2278     {
2279         // take care of the fftpack ordering
2280         ab[0] += a[0]*b[0];
2281         ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1];
2282         ++ab; ++a; ++b; --Ncvec;
2283     }
2284     for(size_t i{0};i < Ncvec;++i)
2285     {
2286         float ar{a[2*i+0]}, ai{a[2*i+1]};
2287         const float br{b[2*i+0]}, bi{b[2*i+1]};
2288         vcplxmul(ar, ai, br, bi);
2289         ab[2*i+0] += ar;
2290         ab[2*i+1] += ai;
2291     }
2292 }
2293
2294
2295 void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
2296     pffft_direction_t direction)
2297 {
2298     pffft_transform_internal(setup, input, output, work, direction, false);
2299 }
2300
2301 void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
2302     float *work, pffft_direction_t direction)
2303 {
2304     pffft_transform_internal(setup, input, output, work, direction, true);
2305 }
2306
2307 #endif /* defined(PFFFT_SIMD_DISABLE) */
2308 /* NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic) */