aarch64/fpu: Add vector variants of erf
[glibc.git] / sysdeps / aarch64 / fpu / erff_advsimd.c
blobc44644a71cffbb6213a8f55e4e655516d108f1d4
1 /* Single-precision vector (Advanced SIMD) erf function
3 Copyright (C) 2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
20 #include "v_math.h"
22 static const struct data
24 float32x4_t max, shift, third;
25 #if WANT_SIMD_EXCEPT
26 float32x4_t tiny_bound, scale_minus_one;
27 #endif
28 } data = {
29 .max = V4 (3.9375), /* 4 - 8/128. */
30 .shift = V4 (0x1p16f),
31 .third = V4 (0x1.555556p-2f), /* 1/3. */
32 #if WANT_SIMD_EXCEPT
33 .tiny_bound = V4 (0x1p-62f),
34 .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0. */
35 #endif
38 #define AbsMask 0x7fffffff
40 struct entry
42 float32x4_t erf;
43 float32x4_t scale;
46 static inline struct entry
47 lookup (uint32x4_t i)
49 struct entry e;
50 float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
51 float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
52 float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
53 float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
54 float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
55 float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
56 e.erf = vuzp1q_f32 (e1, e2);
57 e.scale = vuzp2q_f32 (e1, e2);
58 return e;
61 /* Single-precision implementation of vector erf(x).
62 Approximation based on series expansion near x rounded to
63 nearest multiple of 1/128.
64 Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
66 erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
68 Values of erf(r) and scale are read from lookup tables.
69 For |x| > 3.9375, erf(|x|) rounds to 1.0f.
71 Maximum error: 1.93 ULP
72 _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
73 want 0x1.fd6868p-9. */
74 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
76 const struct data *dat = ptr_barrier (&data);
78 #if WANT_SIMD_EXCEPT
79 /* |x| < 2^-62. */
80 uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
81 float32x4_t xm = x;
82 /* If any lanes are special, mask them with 1 and retain a copy of x to allow
83 special case handler to fix special lanes later. This is only necessary if
84 fenv exceptions are to be triggered correctly. */
85 if (__glibc_unlikely (v_any_u32 (cmp)))
86 x = vbslq_f32 (cmp, v_f32 (1), x);
87 #endif
89 float32x4_t a = vabsq_f32 (x);
90 uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
92 /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to
93 2/sqrt(pi), when x reduced to r = 0. */
94 float32x4_t shift = dat->shift;
95 float32x4_t z = vaddq_f32 (a, shift);
97 uint32x4_t i
98 = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift));
99 i = vminq_u32 (i, v_u32 (512));
100 struct entry e = lookup (i);
102 float32x4_t r = vsubq_f32 (z, shift);
104 /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
105 float32x4_t d = vsubq_f32 (a, r);
106 float32x4_t d2 = vmulq_f32 (d, d);
107 float32x4_t y = vfmaq_f32 (r, dat->third, d);
108 y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y));
110 /* Solves the |x| = inf case. */
111 y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
113 /* Copy sign. */
114 y = vbslq_f32 (v_u32 (AbsMask), y, x);
116 #if WANT_SIMD_EXCEPT
117 if (__glibc_unlikely (v_any_u32 (cmp)))
118 return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
119 #endif
120 return y;
122 libmvec_hidden_def (V_NAME_F1 (erf))
123 HALF_WIDTH_ALIAS_F1 (erf)