1 /* Single-precision vector (Advanced SIMD) erf function
3 Copyright (C) 2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
22 static const struct data
24 float32x4_t max
, shift
, third
;
26 float32x4_t tiny_bound
, scale_minus_one
;
29 .max
= V4 (3.9375), /* 4 - 8/128. */
30 .shift
= V4 (0x1p
16f
),
31 .third
= V4 (0x1.555556p
-2f
), /* 1/3. */
33 .tiny_bound
= V4 (0x1p
-62f
),
34 .scale_minus_one
= V4 (0x1.06eba8p
-3f
), /* scale - 1.0. */
38 #define AbsMask 0x7fffffff
46 static inline struct entry
50 float64_t t0
= *((float64_t
*) (__erff_data
.tab
+ i
[0]));
51 float64_t t1
= *((float64_t
*) (__erff_data
.tab
+ i
[1]));
52 float64_t t2
= *((float64_t
*) (__erff_data
.tab
+ i
[2]));
53 float64_t t3
= *((float64_t
*) (__erff_data
.tab
+ i
[3]));
54 float32x4_t e1
= vreinterpretq_f32_f64 ((float64x2_t
){ t0
, t1
});
55 float32x4_t e2
= vreinterpretq_f32_f64 ((float64x2_t
){ t2
, t3
});
56 e
.erf
= vuzp1q_f32 (e1
, e2
);
57 e
.scale
= vuzp2q_f32 (e1
, e2
);
61 /* Single-precision implementation of vector erf(x).
62 Approximation based on series expansion near x rounded to
63 nearest multiple of 1/128.
64 Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
66 erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
68 Values of erf(r) and scale are read from lookup tables.
69 For |x| > 3.9375, erf(|x|) rounds to 1.0f.
71 Maximum error: 1.93 ULP
72 _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
73 want 0x1.fd6868p-9. */
74 float32x4_t VPCS_ATTR NOINLINE
V_NAME_F1 (erf
) (float32x4_t x
)
76 const struct data
*dat
= ptr_barrier (&data
);
80 uint32x4_t cmp
= vcaltq_f32 (x
, dat
->tiny_bound
);
82 /* If any lanes are special, mask them with 1 and retain a copy of x to allow
83 special case handler to fix special lanes later. This is only necessary if
84 fenv exceptions are to be triggered correctly. */
85 if (__glibc_unlikely (v_any_u32 (cmp
)))
86 x
= vbslq_f32 (cmp
, v_f32 (1), x
);
89 float32x4_t a
= vabsq_f32 (x
);
90 uint32x4_t a_gt_max
= vcgtq_f32 (a
, dat
->max
);
92 /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to
93 2/sqrt(pi), when x reduced to r = 0. */
94 float32x4_t shift
= dat
->shift
;
95 float32x4_t z
= vaddq_f32 (a
, shift
);
98 = vsubq_u32 (vreinterpretq_u32_f32 (z
), vreinterpretq_u32_f32 (shift
));
99 i
= vminq_u32 (i
, v_u32 (512));
100 struct entry e
= lookup (i
);
102 float32x4_t r
= vsubq_f32 (z
, shift
);
104 /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2). */
105 float32x4_t d
= vsubq_f32 (a
, r
);
106 float32x4_t d2
= vmulq_f32 (d
, d
);
107 float32x4_t y
= vfmaq_f32 (r
, dat
->third
, d
);
108 y
= vfmaq_f32 (e
.erf
, e
.scale
, vfmsq_f32 (d
, d2
, y
));
110 /* Solves the |x| = inf case. */
111 y
= vbslq_f32 (a_gt_max
, v_f32 (1.0f
), y
);
114 y
= vbslq_f32 (v_u32 (AbsMask
), y
, x
);
117 if (__glibc_unlikely (v_any_u32 (cmp
)))
118 return vbslq_f32 (cmp
, vfmaq_f32 (xm
, dat
->scale_minus_one
, xm
), y
);
122 libmvec_hidden_def (V_NAME_F1 (erf
))
123 HALF_WIDTH_ALIAS_F1 (erf
)