sysdeps/aarch64/fpu/erff_advsimd.c

   1 /* Single-precision vector (Advanced SIMD) erf function
   2
   3    Copyright (C) 2024 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include "v_math.h"
  21
  22 static const struct data
  23 {
  24   float32x4_t max, shift, third;
  25 #if WANT_SIMD_EXCEPT
  26   float32x4_t tiny_bound, scale_minus_one;
  27 #endif
  28 } data = {
  29   .max = V4 (3.9375), /* 4 - 8/128.  */
  30   .shift = V4 (0x1p16f),
  31   .third = V4 (0x1.555556p-2f), /* 1/3.  */
  32 #if WANT_SIMD_EXCEPT
  33   .tiny_bound = V4 (0x1p-62f),
  34   .scale_minus_one = V4 (0x1.06eba8p-3f), /* scale - 1.0.  */
  35 #endif
  36 };
  37
  38 #define AbsMask 0x7fffffff
  39
  40 struct entry
  41 {
  42   float32x4_t erf;
  43   float32x4_t scale;
  44 };
  45
  46 static inline struct entry
  47 lookup (uint32x4_t i)
  48 {
  49   struct entry e;
  50   float64_t t0 = *((float64_t *) (__erff_data.tab + i[0]));
  51   float64_t t1 = *((float64_t *) (__erff_data.tab + i[1]));
  52   float64_t t2 = *((float64_t *) (__erff_data.tab + i[2]));
  53   float64_t t3 = *((float64_t *) (__erff_data.tab + i[3]));
  54   float32x4_t e1 = vreinterpretq_f32_f64 ((float64x2_t){ t0, t1 });
  55   float32x4_t e2 = vreinterpretq_f32_f64 ((float64x2_t){ t2, t3 });
  56   e.erf = vuzp1q_f32 (e1, e2);
  57   e.scale = vuzp2q_f32 (e1, e2);
  58   return e;
  59 }
  60
  61 /* Single-precision implementation of vector erf(x).
  62    Approximation based on series expansion near x rounded to
  63    nearest multiple of 1/128.
  64    Let d = x - r, and scale = 2 / sqrt(pi) * exp(-r^2). For x near r,
  65
  66    erf(x) ~ erf(r) + scale * d * [1 - r * d - 1/3 * d^2]
  67
  68    Values of erf(r) and scale are read from lookup tables.
  69    For |x| > 3.9375, erf(|x|) rounds to 1.0f.
  70
  71    Maximum error: 1.93 ULP
  72      _ZGVnN4v_erff(0x1.c373e6p-9) got 0x1.fd686cp-9
  73                                  want 0x1.fd6868p-9.  */
  74 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (erf) (float32x4_t x)
  75 {
  76   const struct data *dat = ptr_barrier (&data);
  77
  78 #if WANT_SIMD_EXCEPT
  79   /* |x| < 2^-62.  */
  80   uint32x4_t cmp = vcaltq_f32 (x, dat->tiny_bound);
  81   float32x4_t xm = x;
  82   /* If any lanes are special, mask them with 1 and retain a copy of x to allow
  83      special case handler to fix special lanes later. This is only necessary if
  84      fenv exceptions are to be triggered correctly.  */
  85   if (__glibc_unlikely (v_any_u32 (cmp)))
  86     x = vbslq_f32 (cmp, v_f32 (1), x);
  87 #endif
  88
  89   float32x4_t a = vabsq_f32 (x);
  90   uint32x4_t a_gt_max = vcgtq_f32 (a, dat->max);
  91
  92   /* Lookup erf(r) and scale(r) in tables, e.g. set erf(r) to 0 and scale to
  93      2/sqrt(pi), when x reduced to r = 0.  */
  94   float32x4_t shift = dat->shift;
  95   float32x4_t z = vaddq_f32 (a, shift);
  96
  97   uint32x4_t i
  98       = vsubq_u32 (vreinterpretq_u32_f32 (z), vreinterpretq_u32_f32 (shift));
  99   i = vminq_u32 (i, v_u32 (512));
 100   struct entry e = lookup (i);
 101
 102   float32x4_t r = vsubq_f32 (z, shift);
 103
 104   /* erf(x) ~ erf(r) + scale * d * (1 - r * d - 1/3 * d^2).  */
 105   float32x4_t d = vsubq_f32 (a, r);
 106   float32x4_t d2 = vmulq_f32 (d, d);
 107   float32x4_t y = vfmaq_f32 (r, dat->third, d);
 108   y = vfmaq_f32 (e.erf, e.scale, vfmsq_f32 (d, d2, y));
 109
 110   /* Solves the |x| = inf case.  */
 111   y = vbslq_f32 (a_gt_max, v_f32 (1.0f), y);
 112
 113   /* Copy sign.  */
 114   y = vbslq_f32 (v_u32 (AbsMask), y, x);
 115
 116 #if WANT_SIMD_EXCEPT
 117   if (__glibc_unlikely (v_any_u32 (cmp)))
 118     return vbslq_f32 (cmp, vfmaq_f32 (xm, dat->scale_minus_one, xm), y);
 119 #endif
 120   return y;
 121 }
 122 libmvec_hidden_def (V_NAME_F1 (erf))
 123 HALF_WIDTH_ALIAS_F1 (erf)