sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core_sse4.S

   1 /* Function powf vectorized with SSE4.
   2    Copyright (C) 2014-2016 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_s_powf_data.h"
  21
  22         .text
  23 ENTRY (_ZGVbN4vv_powf_sse4)
  24 /*
  25    ALGORITHM DESCRIPTION:
  26
  27      We are using the next identity: pow(x,y) = 2^(y * log2(x)).
  28
  29      1) log2(x) calculation
  30         Here we use the following formula.
  31         Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2.
  32         Let C ~= 1/ln(2),
  33         Rcp1 ~= 1/X1,   X2=Rcp1*X1,
  34         Rcp2 ~= 1/X2,   X3=Rcp2*X2,
  35         Rcp3 ~= 1/X3,   Rcp3C ~= C/X3.
  36         Then
  37           log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) +
  38                     log2(X1*Rcp1*Rcp2*Rcp3C/C),
  39         where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small.
  40
  41         The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2),
  42         Rcp3C, log2(C/Rcp3C) are taken from tables.
  43         Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C
  44         is exactly represented in target precision.
  45
  46         log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 =
  47              = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... =
  48              = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... =
  49              = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ...,
  50         where
  51              cq=X1*Rcp1*Rcp2*Rcp3C-C,
  52              a1=1/(C*ln(2))-1 is small,
  53              a2=1/(2*C^2*ln2),
  54              a3=1/(3*C^3*ln2),
  55                   ...
  56         Log2 result is split by three parts: HH+HL+HLL
  57
  58      2) Calculation of y*log2(x)
  59         Split y into YHi+YLo.
  60         Get high PH and medium PL parts of y*log2|x|.
  61         Get low PLL part of y*log2|x|.
  62         Now we have PH+PL+PLL ~= y*log2|x|.
  63
  64      3) Calculation of 2^(y*log2(x))
  65         Let's represent PH+PL+PLL in the form N + j/2^expK + Z,
  66         where expK=7 in this implementation, N and j are integers,
  67         0<=j<=2^expK-1, |Z|<2^(-expK-1). Hence
  68         2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z,
  69         where 2^(j/2^expK) is stored in a table, and
  70         2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5.
  71         We compute 2^(PH+PL+PLL) as follows:
  72         Break PH into PHH + PHL, where PHH = N + j/2^expK.
  73         Z = PHL + PL + PLL
  74         Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5
  75         Get 2^(j/2^expK) from table in the form THI+TLO.
  76         Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly).
  77         Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo:
  78         ResHi := THI
  79         ResLo := THI * Exp2Poly + TLO
  80         Get exponent ERes of the result:
  81         Res := ResHi + ResLo:
  82         Result := ex(Res) + N.  */
  83
  84         pushq     %rbp
  85         cfi_adjust_cfa_offset (8)
  86         cfi_rel_offset (%rbp, 0)
  87         movq      %rsp, %rbp
  88         cfi_def_cfa_register (%rbp)
  89         andq      $-64, %rsp
  90         subq      $256, %rsp
  91         movaps    %xmm0, %xmm3
  92         movhlps   %xmm0, %xmm3
  93         movaps    %xmm1, %xmm5
  94         movups    %xmm8, 112(%rsp)
  95         movaps    %xmm5, %xmm2
  96         cvtps2pd  %xmm3, %xmm8
  97         cvtps2pd  %xmm5, %xmm7
  98         movups    %xmm9, 96(%rsp)
  99         movaps    %xmm0, %xmm4
 100         cvtps2pd  %xmm0, %xmm9
 101         movq      __svml_spow_data@GOTPCREL(%rip), %rdx
 102         movups    %xmm10, 176(%rsp)
 103         movups    %xmm13, 48(%rsp)
 104         movups _ExpMask(%rdx), %xmm6
 105
 106 /* preserve mantissa, set input exponent to 2^(-10) */
 107         movaps    %xmm6, %xmm10
 108         andps     %xmm8, %xmm6
 109         andps     %xmm9, %xmm10
 110
 111 /* exponent bits selection */
 112         psrlq     $20, %xmm9
 113         orps _Two10(%rdx), %xmm6
 114         psrlq     $20, %xmm8
 115         orps _Two10(%rdx), %xmm10
 116
 117 /* reciprocal approximation good to at least 11 bits */
 118         cvtpd2ps  %xmm6, %xmm13
 119         cvtpd2ps  %xmm10, %xmm1
 120         movlhps   %xmm13, %xmm13
 121         movhlps   %xmm5, %xmm2
 122         movlhps   %xmm1, %xmm1
 123         movups    %xmm12, 208(%rsp)
 124         rcpps     %xmm13, %xmm12
 125         movups    %xmm11, 80(%rsp)
 126         cvtps2pd  %xmm2, %xmm11
 127         rcpps     %xmm1, %xmm2
 128         movups    %xmm14, 144(%rsp)
 129         cvtps2pd  %xmm12, %xmm14
 130         movups    %xmm15, 160(%rsp)
 131         cvtps2pd  %xmm2, %xmm15
 132         shufps    $221, %xmm8, %xmm9
 133
 134 /* round reciprocal to nearest integer, will have 1+9 mantissa bits */
 135         roundpd   $0, %xmm14, %xmm14
 136
 137 /* biased exponent in DP format */
 138         pshufd    $238, %xmm9, %xmm8
 139         roundpd   $0, %xmm15, %xmm15
 140         cvtdq2pd  %xmm8, %xmm1
 141         mulpd     %xmm15, %xmm10
 142         mulpd     %xmm14, %xmm6
 143         cvtdq2pd  %xmm9, %xmm2
 144         subpd _One(%rdx), %xmm10
 145         subpd _One(%rdx), %xmm6
 146
 147 /* table lookup */
 148         movaps    %xmm14, %xmm8
 149         movaps    %xmm15, %xmm9
 150         psrlq     $40, %xmm8
 151         psrlq     $40, %xmm9
 152         movd      %xmm8, %r8d
 153         movd      %xmm9, %eax
 154         psubd _NMINNORM(%rdx), %xmm4
 155         movdqu _ABSMASK(%rdx), %xmm3
 156         pextrd    $2, %xmm8, %r9d
 157         pand      %xmm5, %xmm3
 158         movups _Threshold(%rdx), %xmm8
 159         pextrd    $2, %xmm9, %ecx
 160         movaps    %xmm8, %xmm9
 161         cmpltpd   %xmm15, %xmm9
 162         cmpltpd   %xmm14, %xmm8
 163         andps _Bias(%rdx), %xmm9
 164         movaps    %xmm10, %xmm14
 165         andps _Bias(%rdx), %xmm8
 166         movaps    %xmm6, %xmm15
 167         orps _Bias1(%rdx), %xmm9
 168         orps _Bias1(%rdx), %xmm8
 169         subpd     %xmm9, %xmm2
 170         subpd     %xmm8, %xmm1
 171         mulpd     %xmm10, %xmm14
 172         mulpd     %xmm6, %xmm15
 173         mulpd _L2(%rdx), %xmm2
 174         mulpd _L2(%rdx), %xmm1
 175         movups _poly_coeff_3(%rdx), %xmm9
 176         movaps    %xmm9, %xmm8
 177         mulpd     %xmm10, %xmm8
 178         mulpd     %xmm6, %xmm9
 179         addpd _poly_coeff_4(%rdx), %xmm8
 180         addpd _poly_coeff_4(%rdx), %xmm9
 181         mulpd     %xmm14, %xmm8
 182         mulpd     %xmm15, %xmm9
 183
 184 /* reconstruction */
 185         addpd     %xmm8, %xmm10
 186         addpd     %xmm9, %xmm6
 187         movslq    %eax, %rax
 188         movslq    %r8d, %r8
 189         movslq    %ecx, %rcx
 190         movslq    %r9d, %r9
 191         movsd     _Log2Rcp_lookup(%rdx,%rax), %xmm13
 192         movsd     _Log2Rcp_lookup(%rdx,%r8), %xmm12
 193         movhpd    _Log2Rcp_lookup(%rdx,%rcx), %xmm13
 194         movhpd    _Log2Rcp_lookup(%rdx,%r9), %xmm12
 195         addpd     %xmm10, %xmm13
 196         addpd     %xmm6, %xmm12
 197         addpd     %xmm13, %xmm2
 198         addpd     %xmm12, %xmm1
 199         mulpd     %xmm7, %xmm2
 200         mulpd     %xmm11, %xmm1
 201         movups __dbInvLn2(%rdx), %xmm11
 202         movdqa    %xmm4, %xmm12
 203         movaps    %xmm11, %xmm10
 204         mulpd     %xmm2, %xmm10
 205         mulpd     %xmm1, %xmm11
 206
 207 /* to round down; if dR is an integer we will get R = 1, which is ok */
 208         movaps    %xmm10, %xmm8
 209         movaps    %xmm11, %xmm9
 210         subpd __dbHALF(%rdx), %xmm8
 211         subpd __dbHALF(%rdx), %xmm9
 212         addpd __dbShifter(%rdx), %xmm8
 213         addpd __dbShifter(%rdx), %xmm9
 214         movaps    %xmm8, %xmm6
 215         movaps    %xmm9, %xmm7
 216         subpd __dbShifter(%rdx), %xmm6
 217         subpd __dbShifter(%rdx), %xmm7
 218
 219 /* [0..1) */
 220         subpd     %xmm6, %xmm10
 221         subpd     %xmm7, %xmm11
 222         mulpd __dbC1(%rdx), %xmm10
 223         mulpd __dbC1(%rdx), %xmm11
 224
 225 /* hi bits */
 226         shufps    $221, %xmm1, %xmm2
 227         movdqu _NMAXVAL(%rdx), %xmm1
 228         pcmpgtd   %xmm1, %xmm12
 229         pcmpeqd   %xmm1, %xmm4
 230         por       %xmm4, %xmm12
 231         movdqa    %xmm3, %xmm1
 232         movdqu _INF(%rdx), %xmm4
 233         pcmpgtd   %xmm4, %xmm1
 234         pcmpeqd   %xmm4, %xmm3
 235
 236 /* iAbsX = iAbsX&iAbsMask */
 237         pand __iAbsMask(%rdx), %xmm2
 238         por       %xmm3, %xmm1
 239
 240 /* iRangeMask = (iAbsX>iDomainRange) */
 241         pcmpgtd __iDomainRange(%rdx), %xmm2
 242         por       %xmm1, %xmm12
 243         movups __lbLOWKBITS(%rdx), %xmm3
 244         por       %xmm2, %xmm12
 245
 246 /* low K bits */
 247         movaps    %xmm3, %xmm2
 248         andps     %xmm9, %xmm3
 249         andps     %xmm8, %xmm2
 250         psrlq     $11, %xmm8
 251
 252 /* dpP= _dbT+lJ*T_ITEM_GRAN */
 253         movd      %xmm2, %r10d
 254         psrlq     $11, %xmm9
 255         movd      %xmm3, %ecx
 256
 257 /* NB : including +/- sign for the exponent!! */
 258         psllq     $52, %xmm8
 259         psllq     $52, %xmm9
 260         pextrw    $4, %xmm2, %r11d
 261         pextrw    $4, %xmm3, %r8d
 262         movmskps  %xmm12, %eax
 263         shll      $3, %r10d
 264         shll      $3, %ecx
 265         shll      $3, %r11d
 266         shll      $3, %r8d
 267         movq      13952(%rdx,%r10), %xmm6
 268         movq      13952(%rdx,%rcx), %xmm7
 269         movhpd    13952(%rdx,%r11), %xmm6
 270         movhpd    13952(%rdx,%r8), %xmm7
 271         mulpd     %xmm6, %xmm10
 272         mulpd     %xmm7, %xmm11
 273         addpd     %xmm10, %xmm6
 274         addpd     %xmm11, %xmm7
 275         paddq     %xmm8, %xmm6
 276         paddq     %xmm9, %xmm7
 277         cvtpd2ps  %xmm6, %xmm1
 278         cvtpd2ps  %xmm7, %xmm4
 279         movlhps   %xmm4, %xmm1
 280         testl     %eax, %eax
 281         jne       .LBL_1_3
 282
 283 .LBL_1_2:
 284         cfi_remember_state
 285         movups    112(%rsp), %xmm8
 286         movaps    %xmm1, %xmm0
 287         movups    96(%rsp), %xmm9
 288         movups    176(%rsp), %xmm10
 289         movups    80(%rsp), %xmm11
 290         movups    208(%rsp), %xmm12
 291         movups    48(%rsp), %xmm13
 292         movups    144(%rsp), %xmm14
 293         movups    160(%rsp), %xmm15
 294         movq      %rbp, %rsp
 295         cfi_def_cfa_register (%rsp)
 296         popq      %rbp
 297         cfi_adjust_cfa_offset (-8)
 298         cfi_restore (%rbp)
 299         ret
 300
 301 .LBL_1_3:
 302         cfi_restore_state
 303         movups    %xmm0, 64(%rsp)
 304         movups    %xmm5, 128(%rsp)
 305         movups    %xmm1, 192(%rsp)
 306         je        .LBL_1_2
 307
 308         xorb      %cl, %cl
 309         xorl      %edx, %edx
 310         movq      %rsi, 8(%rsp)
 311         movq      %rdi, (%rsp)
 312         movq      %r12, 40(%rsp)
 313         cfi_offset_rel_rsp (12, 40)
 314         movb      %cl, %r12b
 315         movq      %r13, 32(%rsp)
 316         cfi_offset_rel_rsp (13, 32)
 317         movl      %eax, %r13d
 318         movq      %r14, 24(%rsp)
 319         cfi_offset_rel_rsp (14, 24)
 320         movl      %edx, %r14d
 321         movq      %r15, 16(%rsp)
 322         cfi_offset_rel_rsp (15, 16)
 323         cfi_remember_state
 324
 325 .LBL_1_6:
 326         btl       %r14d, %r13d
 327         jc        .LBL_1_12
 328
 329 .LBL_1_7:
 330         lea       1(%r14), %esi
 331         btl       %esi, %r13d
 332         jc        .LBL_1_10
 333
 334 .LBL_1_8:
 335         incb      %r12b
 336         addl      $2, %r14d
 337         cmpb      $16, %r12b
 338         jb        .LBL_1_6
 339
 340         movq      8(%rsp), %rsi
 341         movq      (%rsp), %rdi
 342         movq      40(%rsp), %r12
 343         cfi_restore (%r12)
 344         movq      32(%rsp), %r13
 345         cfi_restore (%r13)
 346         movq      24(%rsp), %r14
 347         cfi_restore (%r14)
 348         movq      16(%rsp), %r15
 349         cfi_restore (%r15)
 350         movups    192(%rsp), %xmm1
 351         jmp       .LBL_1_2
 352
 353 .LBL_1_10:
 354         cfi_restore_state
 355         movzbl    %r12b, %r15d
 356         movss     68(%rsp,%r15,8), %xmm0
 357         movss     132(%rsp,%r15,8), %xmm1
 358
 359         call      JUMPTARGET(powf)
 360
 361         movss     %xmm0, 196(%rsp,%r15,8)
 362         jmp       .LBL_1_8
 363
 364 .LBL_1_12:
 365         movzbl    %r12b, %r15d
 366         movss     64(%rsp,%r15,8), %xmm0
 367         movss     128(%rsp,%r15,8), %xmm1
 368
 369         call      JUMPTARGET(powf)
 370
 371         movss     %xmm0, 192(%rsp,%r15,8)
 372         jmp       .LBL_1_7
 373
 374 END (_ZGVbN4vv_powf_sse4)