sysdeps/x86_64/fpu/multiarch/svml_s_asinhf4_core_sse4.S

   1 /* Function asinhf vectorized with SSE4.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *   Compute asinh(x) as log(x + sqrt(x*x + 1))
  23  *
  24  *   Special cases:
  25  *
  26  *   asinh(NaN) = quiet NaN, and raise invalid exception
  27  *   asinh(INF) = that INF
  28  *   asinh(0)   = that 0
  29  *
  30  */
  31
  32 /* Offsets for data table __svml_sasinh_data_internal
  33  */
  34 #define SgnMask                         0
  35 #define sOne                            16
  36 #define sPoly                           32
  37 #define iBrkValue                       160
  38 #define iOffExpoMask                    176
  39 #define sBigThreshold                   192
  40 #define sC2                             208
  41 #define sC3                             224
  42 #define sHalf                           240
  43 #define sLargestFinite                  256
  44 #define sLittleThreshold                272
  45 #define sSign                           288
  46 #define sThirtyOne                      304
  47 #define sTopMask11                      320
  48 #define sTopMask8                       336
  49 #define XScale                          352
  50 #define sLn2                            368
  51
  52 #include <sysdep.h>
  53
  54         .section .text.sse4, "ax", @progbits
  55 ENTRY(_ZGVbN4v_asinhf_sse4)
  56         subq    $72, %rsp
  57         cfi_def_cfa_offset(80)
  58         movaps  %xmm0, %xmm8
  59
  60         /*
  61          * Split X into high and low parts, XHi (<= 11 bits) and XLo (<= 13 bits)
  62          * We could use either X or |X| here, but it doesn't seem to matter
  63          */
  64         movups  sTopMask11+__svml_sasinh_data_internal(%rip), %xmm10
  65         movaps  %xmm8, %xmm2
  66         andps   %xmm8, %xmm10
  67
  68         /*
  69          * Compute X^2 = (XHi + XLo)^2 = XHi^2 + XLo * (X + XHi)
  70          * The two parts are shifted off by around 11 bits. So even though
  71          * the low bit will not in general be exact, it's near enough
  72          */
  73         movaps  %xmm10, %xmm3
  74         subps   %xmm10, %xmm2
  75         mulps   %xmm10, %xmm3
  76         addps   %xmm8, %xmm10
  77
  78         /* Load the constant 1 and a sign mask */
  79         movups  sOne+__svml_sasinh_data_internal(%rip), %xmm7
  80
  81         /*
  82          * Finally, express Y + W = X^2 + 1 accurately where Y has <= 8 bits.
  83          * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1
  84          * as the dominant component in the compensated summation. Otherwise,
  85          * if |X| >= 1, then since X2Hi only has 22 significant bits, the basic
  86          * addition will be exact anyway until we get to |X| >= 2^24. But by
  87          * that time the log function is well-conditioned enough that the
  88          * rounding error doesn't matter. Hence we can treat 1 as dominant even
  89          * if it literally isn't.
  90          */
  91         movaps  %xmm7, %xmm11
  92         movaps  %xmm7, %xmm4
  93         movups  sTopMask8+__svml_sasinh_data_internal(%rip), %xmm12
  94         addps   %xmm3, %xmm11
  95         mulps   %xmm10, %xmm2
  96         subps   %xmm11, %xmm4
  97         movaps  %xmm12, %xmm0
  98         addps   %xmm3, %xmm4
  99
 100         /*
 101          * Unfortunately, we can still be in trouble if |X| <= 2^-5, since
 102          * the absolute error 2^-(7+24)-ish in sqrt(1 + X^2) gets scaled up
 103          * by 1/X and comes close to our threshold. Hence if |X| <= 2^-4,
 104          * perform an alternative computation
 105          * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16
 106          * X2 = X^2
 107          */
 108         addps   %xmm2, %xmm3
 109         addps   %xmm2, %xmm4
 110         andps   %xmm11, %xmm0
 111
 112         /*
 113          * Compute R = 1/sqrt(Y + W) * (1 + d)
 114          * Force R to <= 8 significant bits.
 115          * This means that R * Y and R^2 * Y are exactly representable.
 116          */
 117         rsqrtps %xmm0, %xmm14
 118         subps   %xmm0, %xmm11
 119         andps   %xmm12, %xmm14
 120         addps   %xmm11, %xmm4
 121
 122         /*
 123          * Compute S = (Y/sqrt(Y + W)) * (1 + d)
 124          * and T = (W/sqrt(Y + W)) * (1 + d)
 125          * so that S + T = sqrt(Y + W) * (1 + d)
 126          * S is exact, and the rounding error in T is OK.
 127          */
 128         mulps   %xmm14, %xmm0
 129         mulps   %xmm14, %xmm4
 130
 131         /*
 132          * Get the absolute value of the input, since we will exploit antisymmetry
 133          * and mostly assume X >= 0 in the core computation
 134          */
 135         movups  SgnMask+__svml_sasinh_data_internal(%rip), %xmm6
 136
 137         /*
 138          * Compute e = -(2 * d + d^2)
 139          * The first FMR is exact, and the rounding error in the other is acceptable
 140          * since d and e are ~ 2^-8
 141          */
 142         movaps  %xmm14, %xmm13
 143         andps   %xmm8, %xmm6
 144
 145         /*
 146          * Obtain sqrt(1 + X^2) - 1 in two pieces
 147          * sqrt(1 + X^2) - 1
 148          * = sqrt(Y + W) - 1
 149          * = (S + T) * (1 + Corr) - 1
 150          * = [S - 1] + [T + (S + T) * Corr]
 151          * We need a compensated summation for the last part. We treat S - 1
 152          * as the larger part; it certainly is until about X < 2^-4, and in that
 153          * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1
 154          * Final sum is dTmp5 (hi) + dTmp7 (lo)
 155          */
 156         movaps  %xmm0, %xmm1
 157
 158         /*
 159          * Check whether the input is finite, by checking |X| <= MaxFloat
 160          * Otherwise set the rangemask so that the callout will get used.
 161          * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat)
 162          */
 163         movaps  %xmm6, %xmm9
 164
 165         /*
 166          * The following computation can go wrong for very large X, basically
 167          * because X^2 overflows. But for large X we have
 168          * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
 169          * we can just later stick X back into the log and tweak up the exponent.
 170          * Actually we scale X by 2^-30 and tweak the exponent up by 31,
 171          * to stay in the safe range for the later log computation.
 172          * Compute a flag now telling us when do do this.
 173          */
 174         movaps  %xmm6, %xmm5
 175         cmpnleps sLargestFinite+__svml_sasinh_data_internal(%rip), %xmm9
 176         cmpltps sBigThreshold+__svml_sasinh_data_internal(%rip), %xmm5
 177         mulps   %xmm0, %xmm13
 178         addps   %xmm4, %xmm1
 179         subps   %xmm7, %xmm0
 180         mulps   %xmm4, %xmm14
 181         movmskps %xmm9, %edx
 182         movaps  %xmm7, %xmm9
 183
 184         /*
 185          * Now       1 / (1 + d)
 186          * = 1 / (1 + (sqrt(1 - e) - 1))
 187          * = 1 / sqrt(1 - e)
 188          * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
 189          * So compute the first three nonconstant terms of that, so that
 190          * we have a relative correction (1 + Corr) to apply to S etc.
 191          * C1 = 1/2
 192          * C2 = 3/8
 193          * C3 = 5/16
 194          */
 195         movups  sC3+__svml_sasinh_data_internal(%rip), %xmm15
 196         subps   %xmm13, %xmm9
 197         movups  sHalf+__svml_sasinh_data_internal(%rip), %xmm10
 198         subps   %xmm14, %xmm9
 199
 200         /* sX2over2 = X^2/2 */
 201         mulps   %xmm10, %xmm3
 202         mulps   %xmm9, %xmm15
 203
 204         /* sX46 = -X^4/4 + X^6/8 */
 205         movaps  %xmm3, %xmm2
 206         movaps  %xmm3, %xmm12
 207
 208         /*
 209          * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1].
 210          * It's always safe to assume |X| is larger.
 211          * This is the final 2-part argument to the log1p function
 212          */
 213         movaps  %xmm6, %xmm14
 214         addps   sC2+__svml_sasinh_data_internal(%rip), %xmm15
 215         mulps   %xmm9, %xmm15
 216         addps   %xmm10, %xmm15
 217         mulps   %xmm15, %xmm9
 218         mulps   %xmm1, %xmm9
 219
 220         /* Now multiplex to the case X = 2^-30 * input, Xl = sL = 0 in the "big" case. */
 221         movups  XScale+__svml_sasinh_data_internal(%rip), %xmm15
 222         addps   %xmm9, %xmm4
 223         movaps  %xmm4, %xmm11
 224         addps   %xmm0, %xmm11
 225         subps   %xmm11, %xmm0
 226         addps   %xmm0, %xmm4
 227
 228         /* sX4over4 = X^4/4 */
 229         movaps  %xmm3, %xmm0
 230         mulps   %xmm3, %xmm0
 231         mulps   %xmm0, %xmm2
 232         subps   %xmm0, %xmm2
 233
 234         /*
 235          * Now we feed into the log1p code, using H in place of _VARG1 and
 236          * also adding L into Xl.
 237          * compute 1+x as high, low parts
 238          */
 239         movaps  %xmm7, %xmm0
 240
 241         /* sX46over2 = -X^4/8 + x^6/16 */
 242         mulps   %xmm2, %xmm10
 243         movaps  %xmm7, %xmm2
 244         addps   %xmm10, %xmm12
 245         subps   %xmm12, %xmm3
 246         addps   %xmm3, %xmm10
 247
 248         /* Now multiplex the two possible computations */
 249         movaps  %xmm6, %xmm3
 250         cmpleps sLittleThreshold+__svml_sasinh_data_internal(%rip), %xmm3
 251         movaps  %xmm3, %xmm13
 252         andps   %xmm3, %xmm12
 253         andnps  %xmm11, %xmm13
 254         movaps  %xmm3, %xmm1
 255         orps    %xmm12, %xmm13
 256         andnps  %xmm4, %xmm1
 257         andps   %xmm3, %xmm10
 258         movaps  %xmm6, %xmm4
 259         orps    %xmm10, %xmm1
 260         addps   %xmm13, %xmm14
 261         mulps   %xmm15, %xmm6
 262         maxps   %xmm14, %xmm0
 263         minps   %xmm14, %xmm2
 264         subps   %xmm14, %xmm4
 265         movaps  %xmm0, %xmm3
 266         addps   %xmm4, %xmm13
 267         addps   %xmm2, %xmm3
 268         addps   %xmm13, %xmm1
 269         subps   %xmm3, %xmm0
 270         movaps  %xmm5, %xmm4
 271         andps   %xmm5, %xmm3
 272         andnps  %xmm6, %xmm4
 273         addps   %xmm0, %xmm2
 274
 275         /*
 276          * Now resume the main code.
 277          * reduction: compute r, n
 278          */
 279         movdqu  iBrkValue+__svml_sasinh_data_internal(%rip), %xmm6
 280         orps    %xmm3, %xmm4
 281         psubd   %xmm6, %xmm4
 282         movaps  %xmm7, %xmm0
 283         addps   %xmm2, %xmm1
 284         movdqu  iOffExpoMask+__svml_sasinh_data_internal(%rip), %xmm2
 285         pand    %xmm4, %xmm2
 286         psrad   $23, %xmm4
 287         cvtdq2ps %xmm4, %xmm3
 288         pslld   $23, %xmm4
 289         andps   %xmm5, %xmm1
 290         paddd   %xmm6, %xmm2
 291         psubd   %xmm4, %xmm0
 292         mulps   %xmm0, %xmm1
 293
 294         /* polynomial evaluation */
 295         subps   %xmm7, %xmm2
 296         movups  sPoly+112+__svml_sasinh_data_internal(%rip), %xmm7
 297         addps   %xmm2, %xmm1
 298         mulps   %xmm1, %xmm7
 299         movaps  %xmm5, %xmm2
 300
 301         /* Add 31 to the exponent in the "large" case to get log(2 * input) */
 302         movups  sThirtyOne+__svml_sasinh_data_internal(%rip), %xmm0
 303         addps   sPoly+96+__svml_sasinh_data_internal(%rip), %xmm7
 304         addps   %xmm3, %xmm0
 305         mulps   %xmm1, %xmm7
 306         andnps  %xmm0, %xmm2
 307         andps   %xmm5, %xmm3
 308         orps    %xmm3, %xmm2
 309         addps   sPoly+80+__svml_sasinh_data_internal(%rip), %xmm7
 310
 311         /* final reconstruction */
 312         mulps   sLn2+__svml_sasinh_data_internal(%rip), %xmm2
 313         mulps   %xmm1, %xmm7
 314
 315         /* Finally, reincorporate the original sign. */
 316         movups  sSign+__svml_sasinh_data_internal(%rip), %xmm0
 317         andps   %xmm8, %xmm0
 318         addps   sPoly+64+__svml_sasinh_data_internal(%rip), %xmm7
 319         mulps   %xmm1, %xmm7
 320         addps   sPoly+48+__svml_sasinh_data_internal(%rip), %xmm7
 321         mulps   %xmm1, %xmm7
 322         addps   sPoly+32+__svml_sasinh_data_internal(%rip), %xmm7
 323         mulps   %xmm1, %xmm7
 324         addps   sPoly+16+__svml_sasinh_data_internal(%rip), %xmm7
 325         mulps   %xmm1, %xmm7
 326         addps   sPoly+__svml_sasinh_data_internal(%rip), %xmm7
 327         mulps   %xmm1, %xmm7
 328         mulps   %xmm1, %xmm7
 329         addps   %xmm7, %xmm1
 330         addps   %xmm2, %xmm1
 331         pxor    %xmm1, %xmm0
 332         testl   %edx, %edx
 333
 334         /* Go to special inputs processing branch */
 335         jne     L(SPECIAL_VALUES_BRANCH)
 336         # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm8
 337
 338         /* Restore registers
 339          * and exit the function
 340          */
 341
 342 L(EXIT):
 343         addq    $72, %rsp
 344         cfi_def_cfa_offset(8)
 345         ret
 346         cfi_def_cfa_offset(80)
 347
 348         /* Branch to process
 349          * special inputs
 350          */
 351
 352 L(SPECIAL_VALUES_BRANCH):
 353         movups  %xmm8, 32(%rsp)
 354         movups  %xmm0, 48(%rsp)
 355         # LOE rbx rbp r12 r13 r14 r15 edx
 356
 357         xorl    %eax, %eax
 358         movq    %r12, 16(%rsp)
 359         cfi_offset(12, -64)
 360         movl    %eax, %r12d
 361         movq    %r13, 8(%rsp)
 362         cfi_offset(13, -72)
 363         movl    %edx, %r13d
 364         movq    %r14, (%rsp)
 365         cfi_offset(14, -80)
 366         # LOE rbx rbp r15 r12d r13d
 367
 368         /* Range mask
 369          * bits check
 370          */
 371
 372 L(RANGEMASK_CHECK):
 373         btl     %r12d, %r13d
 374
 375         /* Call scalar math function */
 376         jc      L(SCALAR_MATH_CALL)
 377         # LOE rbx rbp r15 r12d r13d
 378
 379         /* Special inputs
 380          * processing loop
 381          */
 382
 383 L(SPECIAL_VALUES_LOOP):
 384         incl    %r12d
 385         cmpl    $4, %r12d
 386
 387         /* Check bits in range mask */
 388         jl      L(RANGEMASK_CHECK)
 389         # LOE rbx rbp r15 r12d r13d
 390
 391         movq    16(%rsp), %r12
 392         cfi_restore(12)
 393         movq    8(%rsp), %r13
 394         cfi_restore(13)
 395         movq    (%rsp), %r14
 396         cfi_restore(14)
 397         movups  48(%rsp), %xmm0
 398
 399         /* Go to exit */
 400         jmp     L(EXIT)
 401         cfi_offset(12, -64)
 402         cfi_offset(13, -72)
 403         cfi_offset(14, -80)
 404         # LOE rbx rbp r12 r13 r14 r15 xmm0
 405
 406         /* Scalar math fucntion call
 407          * to process special input
 408          */
 409
 410 L(SCALAR_MATH_CALL):
 411         movl    %r12d, %r14d
 412         movss   32(%rsp, %r14, 4), %xmm0
 413         call    asinhf@PLT
 414         # LOE rbx rbp r14 r15 r12d r13d xmm0
 415
 416         movss   %xmm0, 48(%rsp, %r14, 4)
 417
 418         /* Process special inputs in loop */
 419         jmp     L(SPECIAL_VALUES_LOOP)
 420         # LOE rbx rbp r15 r12d r13d
 421 END(_ZGVbN4v_asinhf_sse4)
 422
 423         .section .rodata, "a"
 424         .align  16
 425
 426 #ifdef __svml_sasinh_data_internal_typedef
 427 typedef unsigned int VUINT32;
 428 typedef struct {
 429         __declspec(align(16)) VUINT32 SgnMask[4][1];
 430         __declspec(align(16)) VUINT32 sOne[4][1];
 431         __declspec(align(16)) VUINT32 sPoly[8][4][1];
 432         __declspec(align(16)) VUINT32 iBrkValue[4][1];
 433         __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
 434         __declspec(align(16)) VUINT32 sBigThreshold[4][1];
 435         __declspec(align(16)) VUINT32 sC2[4][1];
 436         __declspec(align(16)) VUINT32 sC3[4][1];
 437         __declspec(align(16)) VUINT32 sHalf[4][1];
 438         __declspec(align(16)) VUINT32 sLargestFinite[4][1];
 439         __declspec(align(16)) VUINT32 sLittleThreshold[4][1];
 440         __declspec(align(16)) VUINT32 sSign[4][1];
 441         __declspec(align(16)) VUINT32 sThirtyOne[4][1];
 442         __declspec(align(16)) VUINT32 sTopMask11[4][1];
 443         __declspec(align(16)) VUINT32 sTopMask8[4][1];
 444         __declspec(align(16)) VUINT32 XScale[4][1];
 445         __declspec(align(16)) VUINT32 sLn2[4][1];
 446 } __svml_sasinh_data_internal;
 447 #endif
 448 __svml_sasinh_data_internal:
 449         /* SgnMask */
 450         .long   0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
 451         /* sOne = SP 1.0 */
 452         .align  16
 453         .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
 454         /* sPoly[] = SP polynomial */
 455         .align  16
 456         .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 457         .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
 458         .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
 459         .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
 460         .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
 461         .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
 462         .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
 463         .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
 464         /* iBrkValue = SP 2/3 */
 465         .align  16
 466         .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 467         /* iOffExpoMask = SP significand mask */
 468         .align  16
 469         .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
 470         /* sBigThreshold */
 471         .align  16
 472         .long   0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
 473         /* sC2 */
 474         .align  16
 475         .long   0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
 476         /* sC3 */
 477         .align  16
 478         .long   0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
 479         /* sHalf */
 480         .align  16
 481         .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
 482         /* sLargestFinite */
 483         .align  16
 484         .long   0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
 485         /* sLittleThreshold */
 486         .align  16
 487         .long   0x3D800000, 0x3D800000, 0x3D800000, 0x3D800000
 488         /* sSign */
 489         .align  16
 490         .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
 491         /* sThirtyOne */
 492         .align  16
 493         .long   0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
 494         /* sTopMask11 */
 495         .align  16
 496         .long   0xFFFFE000, 0xFFFFE000, 0xFFFFE000, 0xFFFFE000
 497         /* sTopMask8 */
 498         .align  16
 499         .long   0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
 500         /* XScale */
 501         .align  16
 502         .long   0x30800000, 0x30800000, 0x30800000, 0x30800000
 503         /* sLn2 = SP ln(2) */
 504         .align  16
 505         .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 506         .align  16
 507         .type   __svml_sasinh_data_internal, @object
 508         .size   __svml_sasinh_data_internal, .-__svml_sasinh_data_internal