sysdeps/aarch64/fpu/log10f_sve.c

   1 /* Single-precision vector (SVE) log10 function
   2
   3    Copyright (C) 2023-2024 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 #include "sv_math.h"
  21
  22 static const struct data
  23 {
  24   float poly_0246[4];
  25   float poly_1357[4];
  26   float ln2, inv_ln10;
  27 } data = {
  28   .poly_1357 = {
  29     /* Coefficients copied from the AdvSIMD routine, then rearranged so that coeffs
  30        1, 3, 5 and 7 can be loaded as a single quad-word, hence used with _lane
  31        variant of MLA intrinsic.  */
  32     0x1.2879c8p-3f, 0x1.6408f8p-4f, 0x1.f0e514p-5f, 0x1.f5f76ap-5f
  33   },
  34   .poly_0246 = { -0x1.bcb79cp-3f, -0x1.bcd472p-4f, -0x1.246f8p-4f,
  35                  -0x1.0fc92cp-4f },
  36   .ln2 = 0x1.62e43p-1f,
  37   .inv_ln10 = 0x1.bcb7b2p-2f,
  38 };
  39
  40 #define Min 0x00800000
  41 #define Max 0x7f800000
  42 #define Thres 0x7f000000  /* Max - Min.  */
  43 #define Offset 0x3f2aaaab /* 0.666667.  */
  44 #define MantissaMask 0x007fffff
  45
  46 static svfloat32_t NOINLINE
  47 special_case (svfloat32_t x, svfloat32_t y, svbool_t special)
  48 {
  49   return sv_call_f32 (log10f, x, y, special);
  50 }
  51
  52 /* Optimised implementation of SVE log10f using the same algorithm and
  53    polynomial as AdvSIMD log10f.
  54    Maximum error is 3.31ulps:
  55    SV_NAME_F1 (log10)(0x1.555c16p+0) got 0x1.ffe2fap-4
  56                                     want 0x1.ffe2f4p-4.  */
  57 svfloat32_t SV_NAME_F1 (log10) (svfloat32_t x, const svbool_t pg)
  58 {
  59   const struct data *d = ptr_barrier (&data);
  60   svuint32_t ix = svreinterpret_u32 (x);
  61   svbool_t special = svcmpge (pg, svsub_x (pg, ix, Min), Thres);
  62
  63   /* x = 2^n * (1+r), where 2/3 < 1+r < 4/3.  */
  64   ix = svsub_x (pg, ix, Offset);
  65   svfloat32_t n = svcvt_f32_x (
  66       pg, svasr_x (pg, svreinterpret_s32 (ix), 23)); /* signextend.  */
  67   ix = svand_x (pg, ix, MantissaMask);
  68   ix = svadd_x (pg, ix, Offset);
  69   svfloat32_t r = svsub_x (pg, svreinterpret_f32 (ix), 1.0f);
  70
  71   /* y = log10(1+r) + n*log10(2)
  72      log10(1+r) ~ r * InvLn(10) + P(r)
  73      where P(r) is a polynomial. Use order 9 for log10(1+x), i.e. order 8 for
  74      log10(1+x)/x, with x in [-1/3, 1/3] (offset=2/3).  */
  75   svfloat32_t r2 = svmul_x (pg, r, r);
  76   svfloat32_t r4 = svmul_x (pg, r2, r2);
  77   svfloat32_t p_1357 = svld1rq (svptrue_b32 (), &d->poly_1357[0]);
  78   svfloat32_t q_01 = svmla_lane (sv_f32 (d->poly_0246[0]), r, p_1357, 0);
  79   svfloat32_t q_23 = svmla_lane (sv_f32 (d->poly_0246[1]), r, p_1357, 1);
  80   svfloat32_t q_45 = svmla_lane (sv_f32 (d->poly_0246[2]), r, p_1357, 2);
  81   svfloat32_t q_67 = svmla_lane (sv_f32 (d->poly_0246[3]), r, p_1357, 3);
  82   svfloat32_t q_47 = svmla_x (pg, q_45, r2, q_67);
  83   svfloat32_t q_03 = svmla_x (pg, q_01, r2, q_23);
  84   svfloat32_t y = svmla_x (pg, q_03, r4, q_47);
  85
  86   /* Using hi = Log10(2)*n + r*InvLn(10) is faster but less accurate.  */
  87   svfloat32_t hi = svmla_x (pg, r, n, d->ln2);
  88   hi = svmul_x (pg, hi, d->inv_ln10);
  89
  90   if (__glibc_unlikely (svptest_any (pg, special)))
  91     return special_case (x, svmla_x (svnot_z (pg, special), hi, r2, y),
  92                          special);
  93   return svmla_x (pg, hi, r2, y);
  94 }