sysdeps/aarch64/fpu/expm1f_sve.c

   1 /* Single-precision SVE expm1
   2
   3    Copyright (C) 2023-2024 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include "sv_math.h"
  21 #include "poly_sve_f32.h"
  22
  23 /* Largest value of x for which expm1(x) should round to -1.  */
  24 #define SpecialBound 0x1.5ebc4p+6f
  25
  26 static const struct data
  27 {
  28   /* These 4 are grouped together so they can be loaded as one quadword, then
  29      used with _lane forms of svmla/svmls.  */
  30   float c2, c4, ln2_hi, ln2_lo;
  31   float c0, c1, c3, inv_ln2, special_bound, shift;
  32 } data = {
  33   /* Generated using fpminimax.  */
  34   .c0 = 0x1.fffffep-2,           .c1 = 0x1.5554aep-3,
  35   .c2 = 0x1.555736p-5,           .c3 = 0x1.12287cp-7,
  36   .c4 = 0x1.6b55a2p-10,
  37
  38   .special_bound = SpecialBound, .shift = 0x1.8p23f,
  39   .inv_ln2 = 0x1.715476p+0f,     .ln2_hi = 0x1.62e4p-1f,
  40   .ln2_lo = 0x1.7f7d1cp-20f,
  41 };
  42
  43 #define C(i) sv_f32 (d->c##i)
  44
  45 static svfloat32_t NOINLINE
  46 special_case (svfloat32_t x, svbool_t pg)
  47 {
  48   return sv_call_f32 (expm1f, x, x, pg);
  49 }
  50
  51 /* Single-precision SVE exp(x) - 1. Maximum error is 1.52 ULP:
  52    _ZGVsMxv_expm1f(0x1.8f4ebcp-2) got 0x1.e859dp-2
  53                                  want 0x1.e859d4p-2.  */
  54 svfloat32_t SV_NAME_F1 (expm1) (svfloat32_t x, svbool_t pg)
  55 {
  56   const struct data *d = ptr_barrier (&data);
  57
  58   /* Large, NaN/Inf.  */
  59   svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
  60
  61   if (__glibc_unlikely (svptest_any (pg, special)))
  62     return special_case (x, pg);
  63
  64   /* This vector is reliant on layout of data - it contains constants
  65      that can be used with _lane forms of svmla/svmls. Values are:
  66      [ coeff_2, coeff_4, ln2_hi, ln2_lo ].  */
  67   svfloat32_t lane_constants = svld1rq (svptrue_b32 (), &d->c2);
  68
  69   /* Reduce argument to smaller range:
  70      Let i = round(x / ln2)
  71      and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
  72      exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
  73      where 2^i is exact because i is an integer.  */
  74   svfloat32_t j = svmla_x (pg, sv_f32 (d->shift), x, d->inv_ln2);
  75   j = svsub_x (pg, j, d->shift);
  76   svint32_t i = svcvt_s32_x (pg, j);
  77
  78   svfloat32_t f = svmls_lane (x, j, lane_constants, 2);
  79   f = svmls_lane (f, j, lane_constants, 3);
  80
  81   /* Approximate expm1(f) using polynomial.
  82      Taylor expansion for expm1(x) has the form:
  83          x + ax^2 + bx^3 + cx^4 ....
  84      So we calculate the polynomial P(f) = a + bf + cf^2 + ...
  85      and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
  86   svfloat32_t p12 = svmla_lane (C (1), f, lane_constants, 0);
  87   svfloat32_t p34 = svmla_lane (C (3), f, lane_constants, 1);
  88   svfloat32_t f2 = svmul_x (pg, f, f);
  89   svfloat32_t p = svmla_x (pg, p12, f2, p34);
  90   p = svmla_x (pg, C (0), f, p);
  91   p = svmla_x (pg, f, f2, p);
  92
  93   /* Assemble the result.
  94      expm1(x) ~= 2^i * (p + 1) - 1
  95      Let t = 2^i.  */
  96   svfloat32_t t = svreinterpret_f32 (
  97       svadd_x (pg, svreinterpret_u32 (svlsl_x (pg, i, 23)), 0x3f800000));
  98   return svmla_x (pg, svsub_x (pg, t, 1), p, t);
  99 }