sysdeps/x86_64/fpu/multiarch/svml_s_acoshf8_core_avx2.S

   1 /* Function acoshf vectorized with AVX2.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *   Compute acosh(x) as log(x + sqrt(x*x - 1))
  23  *
  24  *   Special cases:
  25  *
  26  *   acosh(NaN)  = quiet NaN, and raise invalid exception
  27  *   acosh(-INF) = NaN
  28  *   acosh(+INF) = +INF
  29  *   acosh(x)    = NaN if x < 1
  30  *   acosh(1)    = +0
  31  *
  32  */
  33
  34 /* Offsets for data table __svml_sacosh_data_internal
  35  */
  36 #define sOne                            0
  37 #define sPoly                           32
  38 #define iBrkValue                       288
  39 #define iOffExpoMask                    320
  40 #define sBigThreshold                   352
  41 #define sC2                             384
  42 #define sC3                             416
  43 #define sHalf                           448
  44 #define sLargestFinite                  480
  45 #define sThirtyOne                      512
  46 #define sTopMask8                       544
  47 #define XScale                          576
  48 #define sLn2                            608
  49
  50 #include <sysdep.h>
  51
  52         .section .text.avx2, "ax", @progbits
  53 ENTRY(_ZGVdN8v_acoshf_avx2)
  54         pushq   %rbp
  55         cfi_def_cfa_offset(16)
  56         movq    %rsp, %rbp
  57         cfi_def_cfa(6, 16)
  58         cfi_offset(6, -16)
  59         andq    $-32, %rsp
  60         subq    $96, %rsp
  61
  62         /* Load constants, always including One = 1 */
  63         vmovups sOne+__svml_sacosh_data_internal(%rip), %ymm2
  64
  65         /* Finally, express Y + W = U * V accurately where Y has <= 8 bits */
  66         vmovups sTopMask8+__svml_sacosh_data_internal(%rip), %ymm9
  67
  68         /*
  69          * Now       1 / (1 + d)
  70          * = 1 / (1 + (sqrt(1 - e) - 1))
  71          * = 1 / sqrt(1 - e)
  72          * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + ...
  73          * So compute the first three nonconstant terms of that, so that
  74          * we have a relative correction (1 + Corr) to apply to S etc.
  75          * C1 = 1/2
  76          * C2 = 3/8
  77          * C3 = 5/16
  78          */
  79         vmovups sC3+__svml_sacosh_data_internal(%rip), %ymm14
  80         vmovaps %ymm0, %ymm3
  81         vmovaps %ymm2, %ymm7
  82         vfmsub231ps %ymm3, %ymm3, %ymm7
  83
  84         /*
  85          * Check that 1 < X < +inf; otherwise go to the callout function.
  86          * We need the callout for X = 1 to avoid division by zero below.
  87          * This test ensures that callout handles NaN and either infinity.
  88          */
  89         vcmpnle_uqps sLargestFinite+__svml_sacosh_data_internal(%rip), %ymm3, %ymm4
  90         vcmpngt_uqps %ymm2, %ymm3, %ymm5
  91
  92         /*
  93          * The following computation can go wrong for very large X, e.g.
  94          * the X^2 - 1 = U * V can overflow. But for large X we have
  95          * acosh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30
  96          * we can just later stick X back into the log and tweak up the exponent.
  97          * Actually we scale X by 2^-30 and tweak the exponent up by 31,
  98          * to stay in the safe range for the later log computation.
  99          * Compute a flag now telling us when to do this.
 100          */
 101         vcmplt_oqps sBigThreshold+__svml_sacosh_data_internal(%rip), %ymm3, %ymm1
 102         vandps  %ymm9, %ymm7, %ymm10
 103
 104         /*
 105          * Compute R = 1/sqrt(Y + W) * (1 + d)
 106          * Force R to <= 8 significant bits.
 107          * This means that R * Y and R^2 * Y are exactly representable.
 108          */
 109         vrsqrtps %ymm10, %ymm8
 110         vsubps  %ymm10, %ymm7, %ymm11
 111         vandps  %ymm9, %ymm8, %ymm12
 112
 113         /*
 114          * Compute S = (Y/sqrt(Y + W)) * (1 + d)
 115          * and T = (W/sqrt(Y + W)) * (1 + d)
 116          * so that S + T = sqrt(Y + W) * (1 + d)
 117          * S is exact, and the rounding error in T is OK.
 118          */
 119         vmulps  %ymm12, %ymm10, %ymm15
 120         vmulps  %ymm11, %ymm12, %ymm0
 121
 122         /* Now multiplex to the case X = 2^-30 * input, Xl = 0 in the "big" case. */
 123         vmulps  XScale+__svml_sacosh_data_internal(%rip), %ymm3, %ymm11
 124
 125         /*
 126          * Compute e = -(2 * d + d^2)
 127          * The first FMR is exact, and the rounding error in the other is acceptable
 128          * since d and e are ~ 2^-8
 129          */
 130         vmovaps %ymm2, %ymm13
 131         vfnmadd231ps %ymm15, %ymm12, %ymm13
 132         vfnmadd231ps %ymm0, %ymm12, %ymm13
 133         vfmadd213ps sC2+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
 134         vfmadd213ps sHalf+__svml_sacosh_data_internal(%rip), %ymm13, %ymm14
 135         vmulps  %ymm14, %ymm13, %ymm7
 136         vorps   %ymm5, %ymm4, %ymm6
 137
 138         /*
 139          * For low-accuracy versions, the computation can be done
 140          * just as U + ((S + T) + (S + T) * Corr)
 141          */
 142         vaddps  %ymm0, %ymm15, %ymm5
 143
 144         /* sU is needed later on */
 145         vsubps  %ymm2, %ymm3, %ymm4
 146         vfmadd213ps %ymm5, %ymm7, %ymm5
 147         vmovmskps %ymm6, %edx
 148         vaddps  %ymm5, %ymm4, %ymm6
 149
 150         /*
 151          * Now resume the main code.
 152          * reduction: compute r, n
 153          */
 154         vmovups iBrkValue+__svml_sacosh_data_internal(%rip), %ymm4
 155
 156         /*
 157          * Now we feed into the log1p code, using H in place of _VARG1 and
 158          * also adding L into Xl.
 159          * compute 1+x as high, low parts
 160          */
 161         vmaxps  %ymm6, %ymm2, %ymm8
 162         vminps  %ymm6, %ymm2, %ymm9
 163         vaddps  %ymm9, %ymm8, %ymm12
 164         vblendvps %ymm1, %ymm12, %ymm11, %ymm14
 165         vsubps  %ymm12, %ymm8, %ymm10
 166         vpsubd  %ymm4, %ymm14, %ymm15
 167         vaddps  %ymm10, %ymm9, %ymm13
 168         vpand   iOffExpoMask+__svml_sacosh_data_internal(%rip), %ymm15, %ymm14
 169         vpsrad  $23, %ymm15, %ymm15
 170         vpaddd  %ymm4, %ymm14, %ymm8
 171         vpslld  $23, %ymm15, %ymm5
 172         vmovups sPoly+224+__svml_sacosh_data_internal(%rip), %ymm4
 173         vcvtdq2ps %ymm15, %ymm0
 174         vpsubd  %ymm5, %ymm2, %ymm7
 175
 176         /* polynomial evaluation */
 177         vsubps  %ymm2, %ymm8, %ymm2
 178
 179         /* Add 31 to the exponent in the "large" case to get log(2 * input) */
 180         vaddps  sThirtyOne+__svml_sacosh_data_internal(%rip), %ymm0, %ymm5
 181         vandps  %ymm1, %ymm13, %ymm6
 182         vmulps  %ymm7, %ymm6, %ymm9
 183         vblendvps %ymm1, %ymm0, %ymm5, %ymm0
 184         vaddps  %ymm2, %ymm9, %ymm2
 185         vfmadd213ps sPoly+192+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 186         vfmadd213ps sPoly+160+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 187         vfmadd213ps sPoly+128+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 188         vfmadd213ps sPoly+96+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 189         vfmadd213ps sPoly+64+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 190         vfmadd213ps sPoly+32+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 191         vfmadd213ps sPoly+__svml_sacosh_data_internal(%rip), %ymm2, %ymm4
 192         vmulps  %ymm4, %ymm2, %ymm6
 193         vfmadd213ps %ymm2, %ymm2, %ymm6
 194
 195         /* final reconstruction */
 196         vfmadd132ps sLn2+__svml_sacosh_data_internal(%rip), %ymm6, %ymm0
 197         testl   %edx, %edx
 198
 199         /* Go to special inputs processing branch */
 200         jne     L(SPECIAL_VALUES_BRANCH)
 201         # LOE rbx r12 r13 r14 r15 edx ymm0 ymm3
 202
 203         /* Restore registers
 204          * and exit the function
 205          */
 206
 207 L(EXIT):
 208         movq    %rbp, %rsp
 209         popq    %rbp
 210         cfi_def_cfa(7, 8)
 211         cfi_restore(6)
 212         ret
 213         cfi_def_cfa(6, 16)
 214         cfi_offset(6, -16)
 215
 216         /* Branch to process
 217          * special inputs
 218          */
 219
 220 L(SPECIAL_VALUES_BRANCH):
 221         vmovups %ymm3, 32(%rsp)
 222         vmovups %ymm0, 64(%rsp)
 223         # LOE rbx r12 r13 r14 r15 edx ymm0
 224
 225         xorl    %eax, %eax
 226         # LOE rbx r12 r13 r14 r15 eax edx
 227
 228         vzeroupper
 229         movq    %r12, 16(%rsp)
 230         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
 231         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
 232         movl    %eax, %r12d
 233         movq    %r13, 8(%rsp)
 234         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
 235         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
 236         movl    %edx, %r13d
 237         movq    %r14, (%rsp)
 238         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
 239         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
 240         # LOE rbx r15 r12d r13d
 241
 242         /* Range mask
 243          * bits check
 244          */
 245
 246 L(RANGEMASK_CHECK):
 247         btl     %r12d, %r13d
 248
 249         /* Call scalar math function */
 250         jc      L(SCALAR_MATH_CALL)
 251         # LOE rbx r15 r12d r13d
 252
 253         /* Special inputs
 254          * processing loop
 255          */
 256
 257 L(SPECIAL_VALUES_LOOP):
 258         incl    %r12d
 259         cmpl    $8, %r12d
 260
 261         /* Check bits in range mask */
 262         jl      L(RANGEMASK_CHECK)
 263         # LOE rbx r15 r12d r13d
 264
 265         movq    16(%rsp), %r12
 266         cfi_restore(12)
 267         movq    8(%rsp), %r13
 268         cfi_restore(13)
 269         movq    (%rsp), %r14
 270         cfi_restore(14)
 271         vmovups 64(%rsp), %ymm0
 272
 273         /* Go to exit */
 274         jmp     L(EXIT)
 275         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus)  */
 276         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
 277         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus)  */
 278         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
 279         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus)  */
 280         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
 281         # LOE rbx r12 r13 r14 r15 ymm0
 282
 283         /* Scalar math fucntion call
 284          * to process special input
 285          */
 286
 287 L(SCALAR_MATH_CALL):
 288         movl    %r12d, %r14d
 289         vmovss  32(%rsp, %r14, 4), %xmm0
 290         call    acoshf@PLT
 291         # LOE rbx r14 r15 r12d r13d xmm0
 292
 293         vmovss  %xmm0, 64(%rsp, %r14, 4)
 294
 295         /* Process special inputs in loop */
 296         jmp     L(SPECIAL_VALUES_LOOP)
 297         # LOE rbx r15 r12d r13d
 298 END(_ZGVdN8v_acoshf_avx2)
 299
 300         .section .rodata, "a"
 301         .align  32
 302
 303 #ifdef __svml_sacosh_data_internal_typedef
 304 typedef unsigned int VUINT32;
 305 typedef struct {
 306         __declspec(align(32)) VUINT32 sOne[8][1];
 307         __declspec(align(32)) VUINT32 sPoly[8][8][1];
 308         __declspec(align(32)) VUINT32 iBrkValue[8][1];
 309         __declspec(align(32)) VUINT32 iOffExpoMask[8][1];
 310         __declspec(align(32)) VUINT32 sBigThreshold[8][1];
 311         __declspec(align(32)) VUINT32 sC2[8][1];
 312         __declspec(align(32)) VUINT32 sC3[8][1];
 313         __declspec(align(32)) VUINT32 sHalf[8][1];
 314         __declspec(align(32)) VUINT32 sLargestFinite[8][1];
 315         __declspec(align(32)) VUINT32 sThirtyOne[8][1];
 316         __declspec(align(32)) VUINT32 sTopMask8[8][1];
 317         __declspec(align(32)) VUINT32 XScale[8][1];
 318         __declspec(align(32)) VUINT32 sLn2[8][1];
 319 } __svml_sacosh_data_internal;
 320 #endif
 321 __svml_sacosh_data_internal:
 322         /* sOne = SP 1.0 */
 323         .long   0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
 324         /* sPoly[] = SP polynomial */
 325         .align  32
 326         .long   0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000 /* -5.0000000000000000000000000e-01 P0 */
 327         .long   0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94, 0x3eaaaa94 /* 3.3333265781402587890625000e-01 P1 */
 328         .long   0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e, 0xbe80058e /* -2.5004237890243530273437500e-01 P2 */
 329         .long   0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190, 0x3e4ce190 /* 2.0007920265197753906250000e-01 P3 */
 330         .long   0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37, 0xbe28ad37 /* -1.6472326219081878662109375e-01 P4 */
 331         .long   0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12, 0x3e0fcb12 /* 1.4042308926582336425781250e-01 P5 */
 332         .long   0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3, 0xbe1ad9e3 /* -1.5122179687023162841796875e-01 P6 */
 333         .long   0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed, 0x3e0d84ed /* 1.3820238411426544189453125e-01 P7 */
 334         /* iBrkValue = SP 2/3 */
 335         .align  32
 336         .long   0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
 337         /* iOffExpoMask = SP significand mask */
 338         .align  32
 339         .long   0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
 340         /* sBigThreshold */
 341         .align  32
 342         .long   0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000, 0x4E800000
 343         /* sC2 */
 344         .align  32
 345         .long   0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000, 0x3EC00000
 346         /* sC3 */
 347         .align  32
 348         .long   0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000, 0x3EA00000
 349         /* sHalf */
 350         .align  32
 351         .long   0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000
 352         /* sLargestFinite */
 353         .align  32
 354         .long   0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF
 355         /* sThirtyOne */
 356         .align  32
 357         .long   0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000, 0x41F80000
 358         /* sTopMask8 */
 359         .align  32
 360         .long   0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000
 361         /* XScale */
 362         .align  32
 363         .long   0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000, 0x30800000
 364         /* sLn2 = SP ln(2) */
 365         .align  32
 366         .long   0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218
 367         .align  32
 368         .type   __svml_sacosh_data_internal, @object
 369         .size   __svml_sacosh_data_internal, .-__svml_sacosh_data_internal