sysdeps/x86_64/fpu/multiarch/svml_s_atanhf4_core_sse4.S

   1 /* Function atanhf vectorized with SSE4.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x))
  23  *
  24  *   Special cases:
  25  *
  26  *   atanh(0)  = 0
  27  *   atanh(+1) = +INF
  28  *   atanh(-1) = -INF
  29  *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
  30  *
  31  */
  32
  33 /* Offsets for data table __svml_satanh_data_internal_avx512. Ordered
  34    by use in the function. On cold-starts this might help the
  35    prefetcher. Possibly a better idea is to interleave start/end so
  36    that the prefetcher is less likely to detect a stream and pull
  37    irrelivant lines into cache.  */
  38 #define sOne                            0
  39 #define SgnMask                         16
  40 #define sTopMask12                      32
  41 #define iBrkValue                       48
  42 #define iOffExpoMask                    64
  43 #define sPoly                           80
  44 #define sLn2                            208
  45 #define TinyRange                       224
  46
  47 #include <sysdep.h>
  48 #define ATANHF_DATA(x)                  ((x)+__svml_satanh_data_internal)
  49
  50         .section .text.sse4, "ax", @progbits
  51 ENTRY(_ZGVbN4v_atanhf_sse4)
  52         movaps  %xmm0, %xmm5
  53
  54         /* Load constants including One = 1 */
  55         movups  ATANHF_DATA(sOne)(%rip), %xmm4
  56         movaps  %xmm5, %xmm3
  57
  58         /* Strip off the sign, so treat X as positive until right at the end */
  59         movups  ATANHF_DATA(SgnMask)(%rip), %xmm1
  60         movaps  %xmm4, %xmm2
  61         andps   %xmm1, %xmm0
  62         movaps  %xmm4, %xmm10
  63         movups  ATANHF_DATA(sTopMask12)(%rip), %xmm11
  64         movaps  %xmm4, %xmm14
  65         movaps  %xmm11, %xmm9
  66
  67
  68         /*
  69          * Compute V = 2 * X trivially, and UHi + U_lo = 1 - X in two pieces,
  70          * the upper part UHi being <= 12 bits long. Then we have
  71          * atanh(X) = 1/2 * log((1 + X) / (1 - X)) = 1/2 * log1p(V / (UHi + ULo)).
  72          */
  73         movaps  %xmm0, %xmm6
  74         mulps   %xmm5, %xmm3
  75         subps   %xmm0, %xmm2
  76         addps   %xmm0, %xmm6
  77         subps   %xmm2, %xmm10
  78         addps   %xmm5, %xmm3
  79         subps   %xmm0, %xmm10
  80         andps   %xmm2, %xmm9
  81
  82
  83         /*
  84          * Check whether |X| < 1, in which case we use the main function.
  85          * Otherwise set the rangemask so that the callout will get used.
  86          * Note that this will also use the callout for NaNs since not(NaN < 1).
  87          */
  88         rcpps   %xmm9, %xmm7
  89         subps   %xmm9, %xmm2
  90         andps   %xmm11, %xmm7
  91
  92
  93         /*
  94          * Split V as well into upper 12 bits and lower part, so that we can get
  95          * a preliminary quotient estimate without rounding error.
  96          */
  97         andps   %xmm6, %xmm11
  98         mulps   %xmm7, %xmm9
  99         addps   %xmm2, %xmm10
 100         subps   %xmm11, %xmm6
 101
 102         /* Hence get initial quotient estimate QHi + QLo = R * VHi + R * VLo */
 103         mulps   %xmm7, %xmm11
 104         mulps   %xmm7, %xmm10
 105         subps   %xmm9, %xmm14
 106         mulps   %xmm6, %xmm7
 107         subps   %xmm10, %xmm14
 108
 109         /* Compute D = E + E^2 */
 110         movaps  %xmm14, %xmm13
 111         movaps  %xmm4, %xmm8
 112         mulps   %xmm14, %xmm13
 113
 114         /* reduction: compute r,n */
 115         movdqu  ATANHF_DATA(iBrkValue)(%rip), %xmm9
 116         addps   %xmm13, %xmm14
 117
 118         /*
 119          * Compute R * (VHi + VLo) * (1 + E + E^2)
 120          * = R *  (VHi + VLo) * (1 + D)
 121          * = QHi + (QHi * D + QLo + QLo * D)
 122          */
 123         movaps  %xmm14, %xmm2
 124         mulps   %xmm7, %xmm14
 125         mulps   %xmm11, %xmm2
 126         addps   %xmm14, %xmm7
 127         movdqu  ATANHF_DATA(iOffExpoMask)(%rip), %xmm12
 128         movaps  %xmm4, %xmm14
 129
 130         /* Record the sign for eventual reincorporation. */
 131         addps   %xmm7, %xmm2
 132
 133
 134         /*
 135          * Now finally accumulate the high and low parts of the
 136          * argument to log1p, H + L, with a final compensated summation.
 137          */
 138         movaps  %xmm2, %xmm6
 139         andnps  %xmm5, %xmm1
 140         movaps  %xmm4, %xmm7
 141         /* Or the sign bit in with the tiny result to handle atanh(-0) correctly */
 142         addps   %xmm11, %xmm6
 143         maxps   %xmm6, %xmm7
 144         minps   %xmm6, %xmm8
 145         subps   %xmm6, %xmm11
 146         movaps  %xmm7, %xmm10
 147         addps   %xmm8, %xmm10
 148         addps   %xmm11, %xmm2
 149         subps   %xmm10, %xmm7
 150         psubd   %xmm9, %xmm10
 151         addps   %xmm8, %xmm7
 152         pand    %xmm10, %xmm12
 153         psrad   $23, %xmm10
 154         cvtdq2ps %xmm10, %xmm13
 155         addps   %xmm7, %xmm2
 156
 157         /* final reconstruction */
 158         pslld   $23, %xmm10
 159         paddd   %xmm9, %xmm12
 160         psubd   %xmm10, %xmm14
 161
 162         /* polynomial evaluation */
 163         subps   %xmm4, %xmm12
 164         mulps   %xmm14, %xmm2
 165         movups  ATANHF_DATA(sPoly+0)(%rip), %xmm7
 166         addps   %xmm12, %xmm2
 167         mulps   %xmm2, %xmm7
 168
 169
 170         /* Finally, halve the result and reincorporate the sign */
 171         addps   ATANHF_DATA(sPoly+16)(%rip), %xmm7
 172         mulps   %xmm2, %xmm7
 173         addps   ATANHF_DATA(sPoly+32)(%rip), %xmm7
 174         mulps   %xmm2, %xmm7
 175         addps   ATANHF_DATA(sPoly+48)(%rip), %xmm7
 176         mulps   %xmm2, %xmm7
 177         addps   ATANHF_DATA(sPoly+64)(%rip), %xmm7
 178         mulps   %xmm2, %xmm7
 179         addps   ATANHF_DATA(sPoly+80)(%rip), %xmm7
 180         mulps   %xmm2, %xmm7
 181         addps   ATANHF_DATA(sPoly+96)(%rip), %xmm7
 182         mulps   %xmm2, %xmm7
 183         movaps  ATANHF_DATA(sPoly+112)(%rip), %xmm6
 184         addps   %xmm6, %xmm7
 185         mulps   %xmm2, %xmm7
 186         mulps   %xmm2, %xmm7
 187         mulps   ATANHF_DATA(sLn2)(%rip), %xmm13
 188         /* We can build `sHalf` with `sPoly & sOne`.  */
 189         andps   %xmm4, %xmm6
 190         orps    %xmm1, %xmm3
 191         xorps   %xmm6, %xmm1
 192
 193         addps   %xmm2, %xmm7
 194         addps   %xmm13, %xmm7
 195         mulps   %xmm7, %xmm1
 196
 197         /* Finish check of NaNs.  */
 198         cmpleps %xmm0, %xmm4
 199         movmskps %xmm4, %edx
 200         cmpltps ATANHF_DATA(TinyRange)(%rip), %xmm0
 201
 202         andps   %xmm0, %xmm3
 203         andnps  %xmm1, %xmm0
 204         orps    %xmm3, %xmm0
 205
 206         testl   %edx, %edx
 207         /* Go to special inputs processing branch.  */
 208         jne     L(SPECIAL_VALUES_BRANCH)
 209         # LOE rbx rbp r12 r13 r14 r15 xmm0
 210         /* No registers to restore on fast path.  */
 211         ret
 212
 213
 214         /* Cold case. edx has 1s where there was a special value that
 215            needs to be handled by a atanhf call. Optimize for code size
 216            more so than speed here. */
 217 L(SPECIAL_VALUES_BRANCH):
 218         # LOE rbx rdx rbp r12 r13 r14 r15 xmm0 xmm5
 219         /* Stack coming in 16-byte aligned. Set 8-byte misaligned so on
 220        call entry will be 16-byte aligned. */
 221         subq    $56, %rsp
 222         cfi_def_cfa_offset(64)
 223         movups  %xmm0, 24(%rsp)
 224         movups  %xmm5, 40(%rsp)
 225
 226         /* Use rbx/rbp for callee save registers as they get short
 227        encoding for many instructions (as compared with r12/r13). */
 228         movq    %rbx, (%rsp)
 229         cfi_offset(rbx, -64)
 230         movq    %rbp, 8(%rsp)
 231         cfi_offset(rbp, -56)
 232         /* edx has 1s where there was a special value that needs to be handled
 233            by a tanhf call.  */
 234         movl    %edx, %ebx
 235 L(SPECIAL_VALUES_LOOP):
 236         # LOE rbx rbp r12 r13 r14 r15
 237         /* use rbp as index for special value that is saved across calls to
 238            tanhf. We technically don't need a callee save register here as offset
 239            to rsp is always [0, 12] so we can restore rsp by realigning to 64.
 240            Essentially the tradeoff is 1 extra save/restore vs 2 extra instructions
 241            in the loop.  */
 242         xorl    %ebp, %ebp
 243         bsfl    %ebx, %ebp
 244
 245         /* Scalar math fucntion call to process special input.  */
 246         movss   40(%rsp, %rbp, 4), %xmm0
 247         call    atanhf@PLT
 248         /* No good way to avoid the store-forwarding fault this will cause on
 249            return. `lfence` avoids the SF fault but at greater cost as it
 250            serialized stack/callee save restoration.  */
 251         movss   %xmm0, 24(%rsp, %rbp, 4)
 252
 253         leal    -1(%rbx), %eax
 254         andl    %eax, %ebx
 255         jnz     L(SPECIAL_VALUES_LOOP)
 256         # LOE r12 r13 r14 r15
 257         /* All results have been written to 24(%rsp).  */
 258         movups  24(%rsp), %xmm0
 259         movq    (%rsp), %rbx
 260         cfi_restore(rbx)
 261         movq    8(%rsp), %rbp
 262         cfi_restore(rbp)
 263         addq    $56, %rsp
 264         cfi_def_cfa_offset(8)
 265         ret
 266 END(_ZGVbN4v_atanhf_sse4)
 267
 268         .section .rodata, "a"
 269         .align  16
 270
 271 #ifdef __svml_satanh_data_internal_typedef
 272 typedef unsigned int VUINT32;
 273 typedef struct{
 274         __declspec(align(16)) VUINT32 sOne[4][1];
 275         __declspec(align(16)) VUINT32 SgnMask[4][1];
 276         __declspec(align(16)) VUINT32 sTopMask12[4][1];
 277         __declspec(align(16)) VUINT32 iBrkValue[4][1];
 278         __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
 279         __declspec(align(16)) VUINT32 sPoly[8][4][1];
 280         __declspec(align(16)) VUINT32 sLn2[4][1];
 281         __declspec(align(16)) VUINT32 TinyRange[4][1];
 282 } __svml_satanh_data_internal;
 283 #endif
 284
 285 __svml_satanh_data_internal:
 286         /* sOne = SP 1.0 */
 287         .align  16
 288         .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
 289         /* SgnMask */
 290         .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 291         /* sTopMask12 */
 292         .align  16
 293         .long   0xFFFFF000, 0xFFFFF000, 0xFFFFF000, 0xFFFFF000
 294         /* iBrkValue = SP 2/3 */
 295         .align  16
 296         .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 297         /* iOffExpoMask = SP significand mask ==*/
 298         .align  16
 299         .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
 300
 301         /* sPoly[] = SP polynomial */
 302         .align  16
 303         .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
 304         .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
 305         .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
 306         .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
 307         .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
 308         .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
 309         .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
 310         .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 311
 312         /* sLn2 = SP ln(2) */
 313         .align  16
 314         .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 315         /* TinyRange */
 316         .align  16
 317         .long   0x0C000000, 0x0C000000, 0x0C000000, 0x0C000000
 318         .align  16
 319         .type   __svml_satanh_data_internal, @object
 320         .size   __svml_satanh_data_internal, .-__svml_satanh_data_internal