1 /* Double-precision vector (AdvSIMD) exp10 function.
3 Copyright (C) 2023-2024 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
22 /* Value of |x| above which scale overflows without special treatment. */
23 #define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
24 /* Value of n above which scale overflows even with special treatment. */
25 #define ScaleBound 163840.0 /* 1280.0 * N. */
27 const static struct data
30 float64x2_t log10_2
, log2_10_hi
, log2_10_lo
, shift
;
32 float64x2_t special_bound
, scale_thresh
;
35 /* Coefficients generated using Remez algorithm.
36 rel error: 0x1.5ddf8f28p-54
37 abs error: 0x1.5ed266c8p-54 in [ -log10(2)/256, log10(2)/256 ]
38 maxerr: 1.14432 +0.5 ulp. */
39 .poly
= { V2 (0x1.26bb1bbb5524p1
), V2 (0x1.53524c73cecdap1
),
40 V2 (0x1.047060efb781cp1
), V2 (0x1.2bd76040f0d16p0
) },
41 .log10_2
= V2 (0x1.a934f0979a371p8
), /* N/log2(10). */
42 .log2_10_hi
= V2 (0x1.34413509f79ffp
-9), /* log2(10)/N. */
43 .log2_10_lo
= V2 (-0x1.9dc1da994fd21p
-66),
44 .shift
= V2 (0x1.8p
+52),
46 .scale_thresh
= V2 (ScaleBound
),
47 .special_bound
= V2 (SpecialBound
),
51 #define N (1 << V_EXP_TABLE_BITS)
52 #define IndexMask v_u64 (N - 1)
56 # define TinyBound v_u64 (0x2000000000000000) /* asuint64 (0x1p-511). */
57 # define BigBound v_u64 (0x4070000000000000) /* asuint64 (0x1p8). */
58 # define Thres v_u64 (0x2070000000000000) /* BigBound - TinyBound. */
60 static float64x2_t VPCS_ATTR NOINLINE
61 special_case (float64x2_t x
, float64x2_t y
, uint64x2_t cmp
)
63 /* If fenv exceptions are to be triggered correctly, fall back to the scalar
64 routine for special lanes. */
65 return v_call_f64 (exp10
, x
, y
, cmp
);
70 # define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
71 /* SpecialBias1 + SpecialBias1 = asuint(1.0). */
72 # define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
73 # define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
75 static inline float64x2_t VPCS_ATTR
76 special_case (float64x2_t s
, float64x2_t y
, float64x2_t n
,
79 /* 2^(n/N) may overflow, break it up into s1*s2. */
80 uint64x2_t b
= vandq_u64 (vcltzq_f64 (n
), SpecialOffset
);
81 float64x2_t s1
= vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1
, b
));
82 float64x2_t s2
= vreinterpretq_f64_u64 (
83 vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s
), SpecialBias2
), b
));
84 uint64x2_t cmp
= vcagtq_f64 (n
, d
->scale_thresh
);
85 float64x2_t r1
= vmulq_f64 (s1
, s1
);
86 float64x2_t r0
= vmulq_f64 (vfmaq_f64 (s2
, y
, s2
), s1
);
87 return vbslq_f64 (cmp
, r1
, r0
);
92 /* Fast vector implementation of exp10.
93 Maximum measured error is 1.64 ulp.
94 _ZGVnN2v_exp10(0x1.ccd1c9d82cc8cp+0) got 0x1.f8dab6d7fed0cp+5
95 want 0x1.f8dab6d7fed0ap+5. */
96 float64x2_t VPCS_ATTR
V_NAME_D1 (exp10
) (float64x2_t x
)
98 const struct data
*d
= ptr_barrier (&data
);
101 /* If any lanes are special, mask them with 1 and retain a copy of x to allow
102 special_case to fix special lanes later. This is only necessary if fenv
103 exceptions are to be triggered correctly. */
105 uint64x2_t iax
= vreinterpretq_u64_f64 (vabsq_f64 (x
));
106 cmp
= vcgeq_u64 (vsubq_u64 (iax
, TinyBound
), Thres
);
107 if (__glibc_unlikely (v_any_u64 (cmp
)))
108 x
= vbslq_f64 (cmp
, v_f64 (1), x
);
110 cmp
= vcageq_f64 (x
, d
->special_bound
);
113 /* n = round(x/(log10(2)/N)). */
114 float64x2_t z
= vfmaq_f64 (d
->shift
, x
, d
->log10_2
);
115 uint64x2_t u
= vreinterpretq_u64_f64 (z
);
116 float64x2_t n
= vsubq_f64 (z
, d
->shift
);
118 /* r = x - n*log10(2)/N. */
120 r
= vfmsq_f64 (r
, d
->log2_10_hi
, n
);
121 r
= vfmsq_f64 (r
, d
->log2_10_lo
, n
);
123 uint64x2_t e
= vshlq_n_u64 (u
, 52 - V_EXP_TABLE_BITS
);
124 uint64x2_t i
= vandq_u64 (u
, IndexMask
);
126 /* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
127 float64x2_t r2
= vmulq_f64 (r
, r
);
128 float64x2_t p
= vfmaq_f64 (d
->poly
[0], r
, d
->poly
[1]);
129 float64x2_t y
= vfmaq_f64 (d
->poly
[2], r
, d
->poly
[3]);
130 p
= vfmaq_f64 (p
, y
, r2
);
131 y
= vmulq_f64 (r
, p
);
134 u
= v_lookup_u64 (__v_exp_data
, i
);
135 float64x2_t s
= vreinterpretq_f64_u64 (vaddq_u64 (u
, e
));
137 if (__glibc_unlikely (v_any_u64 (cmp
)))
139 return special_case (xm
, vfmaq_f64 (s
, y
, s
), cmp
);
141 return special_case (s
, y
, n
, d
);
144 return vfmaq_f64 (s
, y
, s
);