sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S

   1 /* Function cosf vectorized with AVX-512. KNL and SKX versions.
   2    Copyright (C) 2014-2016 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_s_trig_data.h"
  21 #include "svml_s_wrapper_impl.h"
  22
  23         .text
  24 ENTRY (_ZGVeN16v_cosf_knl)
  25 #ifndef HAVE_AVX512_ASM_SUPPORT
  26 WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
  27 #else
  28 /*
  29   ALGORITHM DESCRIPTION:
  30
  31   1) Range reduction to [-Pi/2; +Pi/2] interval
  32      a) We remove sign using AND operation
  33      b) Add Pi/2 value to argument X for Cos to Sin transformation
  34      c) Getting octant Y by 1/Pi multiplication
  35      d) Add "Right Shifter" value
  36      e) Treat obtained value as integer for destination sign setting.
  37         Shift first bit of this value to the last (sign) position
  38      f) Subtract "Right Shifter"  value
  39      g) Subtract 0.5 from result for octant correction
  40      h) Subtract Y*PI from X argument, where PI divided to 4 parts:
  41         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
  42   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
  43      a) Calculate X^2 = X * X
  44      b) Calculate polynomial:
  45         R = X + X * X^2 * (A3 + x^2 * (A5 + .....
  46   3) Destination sign setting
  47      a) Set shifted destination sign using XOR operation:
  48         R = XOR( R, S );
  49  */
  50         pushq     %rbp
  51         cfi_adjust_cfa_offset (8)
  52         cfi_rel_offset (%rbp, 0)
  53         movq      %rsp, %rbp
  54         cfi_def_cfa_register (%rbp)
  55         andq      $-64, %rsp
  56         subq      $1280, %rsp
  57         movq      __svml_s_trig_data@GOTPCREL(%rip), %rdx
  58
  59 /*
  60   h) Subtract Y*PI from X argument, where PI divided to 4 parts:
  61   X = X - Y*PI1 - Y*PI2 - Y*PI3
  62  */
  63         vmovaps   %zmm0, %zmm6
  64         movl      $-1, %eax
  65
  66 /* b) Add Pi/2 value to argument X for Cos to Sin transformation */
  67         vaddps    __sHalfPI(%rdx), %zmm0, %zmm2
  68         vmovups   __sRShifter(%rdx), %zmm3
  69
  70 /*
  71   1) Range reduction to [-Pi/2; +Pi/2] interval
  72   c) Getting octant Y by 1/Pi multiplication
  73   d) Add "Right Shifter" (0x4B000000) value
  74  */
  75         vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
  76         vmovups     __sPI1_FMA(%rdx), %zmm5
  77
  78 /* f) Subtract "Right Shifter" (0x4B000000) value */
  79         vsubps    %zmm3, %zmm2, %zmm4
  80         vmovups   __sA9_FMA(%rdx), %zmm9
  81
  82 /* Check for large and special arguments */
  83         vpandd    __sAbsMask(%rdx), %zmm0, %zmm1
  84
  85 /*
  86   e) Treat obtained value as integer for destination sign setting.
  87   Shift first bit of this value to the last (sign) position (S << 31)
  88  */
  89         vpslld       $31, %zmm2, %zmm8
  90         vcmpps       $22, __sRangeReductionVal(%rdx), %zmm1, %k1
  91         vpbroadcastd %eax, %zmm12{%k1}{z}
  92
  93 /* g) Subtract 0.5 from result for octant correction */
  94         vsubps       __sOneHalf(%rdx), %zmm4, %zmm7
  95         vptestmd     %zmm12, %zmm12, %k0
  96         vfnmadd231ps %zmm7, %zmm5, %zmm6
  97         kmovw        %k0, %ecx
  98         vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
  99         vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
 100
 101 /* a) Calculate X^2 = X * X */
 102         vmulps    %zmm7, %zmm7, %zmm10
 103
 104 /*
 105   3) Destination sign setting
 106     a) Set shifted destination sign using XOR operation:
 107   R = XOR( R, S );
 108  */
 109         vpxord    %zmm8, %zmm7, %zmm11
 110
 111 /*
 112   b) Calculate polynomial:
 113   R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
 114  */
 115         vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
 116         vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
 117         vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
 118         vmulps      %zmm10, %zmm9, %zmm1
 119         vfmadd213ps %zmm11, %zmm11, %zmm1
 120         testl       %ecx, %ecx
 121         jne         .LBL_1_3
 122
 123 .LBL_1_2:
 124         cfi_remember_state
 125         vmovaps   %zmm1, %zmm0
 126         movq      %rbp, %rsp
 127         cfi_def_cfa_register (%rsp)
 128         popq      %rbp
 129         cfi_adjust_cfa_offset (-8)
 130         cfi_restore (%rbp)
 131         ret
 132
 133 .LBL_1_3:
 134         cfi_restore_state
 135         vmovups   %zmm0, 1152(%rsp)
 136         vmovups   %zmm1, 1216(%rsp)
 137         je        .LBL_1_2
 138
 139         xorb      %dl, %dl
 140         kmovw     %k4, 1048(%rsp)
 141         xorl      %eax, %eax
 142         kmovw     %k5, 1040(%rsp)
 143         kmovw     %k6, 1032(%rsp)
 144         kmovw     %k7, 1024(%rsp)
 145         vmovups   %zmm16, 960(%rsp)
 146         vmovups   %zmm17, 896(%rsp)
 147         vmovups   %zmm18, 832(%rsp)
 148         vmovups   %zmm19, 768(%rsp)
 149         vmovups   %zmm20, 704(%rsp)
 150         vmovups   %zmm21, 640(%rsp)
 151         vmovups   %zmm22, 576(%rsp)
 152         vmovups   %zmm23, 512(%rsp)
 153         vmovups   %zmm24, 448(%rsp)
 154         vmovups   %zmm25, 384(%rsp)
 155         vmovups   %zmm26, 320(%rsp)
 156         vmovups   %zmm27, 256(%rsp)
 157         vmovups   %zmm28, 192(%rsp)
 158         vmovups   %zmm29, 128(%rsp)
 159         vmovups   %zmm30, 64(%rsp)
 160         vmovups   %zmm31, (%rsp)
 161         movq      %rsi, 1064(%rsp)
 162         movq      %rdi, 1056(%rsp)
 163         movq      %r12, 1096(%rsp)
 164         cfi_offset_rel_rsp (12, 1096)
 165         movb      %dl, %r12b
 166         movq      %r13, 1088(%rsp)
 167         cfi_offset_rel_rsp (13, 1088)
 168         movl      %ecx, %r13d
 169         movq      %r14, 1080(%rsp)
 170         cfi_offset_rel_rsp (14, 1080)
 171         movl      %eax, %r14d
 172         movq      %r15, 1072(%rsp)
 173         cfi_offset_rel_rsp (15, 1072)
 174         cfi_remember_state
 175
 176 .LBL_1_6:
 177         btl       %r14d, %r13d
 178         jc        .LBL_1_12
 179
 180 .LBL_1_7:
 181         lea       1(%r14), %esi
 182         btl       %esi, %r13d
 183         jc        .LBL_1_10
 184
 185 .LBL_1_8:
 186         addb      $1, %r12b
 187         addl      $2, %r14d
 188         cmpb      $16, %r12b
 189         jb        .LBL_1_6
 190
 191         kmovw     1048(%rsp), %k4
 192         movq      1064(%rsp), %rsi
 193         kmovw     1040(%rsp), %k5
 194         movq      1056(%rsp), %rdi
 195         kmovw     1032(%rsp), %k6
 196         movq      1096(%rsp), %r12
 197         cfi_restore (%r12)
 198         movq      1088(%rsp), %r13
 199         cfi_restore (%r13)
 200         kmovw     1024(%rsp), %k7
 201         vmovups   960(%rsp), %zmm16
 202         vmovups   896(%rsp), %zmm17
 203         vmovups   832(%rsp), %zmm18
 204         vmovups   768(%rsp), %zmm19
 205         vmovups   704(%rsp), %zmm20
 206         vmovups   640(%rsp), %zmm21
 207         vmovups   576(%rsp), %zmm22
 208         vmovups   512(%rsp), %zmm23
 209         vmovups   448(%rsp), %zmm24
 210         vmovups   384(%rsp), %zmm25
 211         vmovups   320(%rsp), %zmm26
 212         vmovups   256(%rsp), %zmm27
 213         vmovups   192(%rsp), %zmm28
 214         vmovups   128(%rsp), %zmm29
 215         vmovups   64(%rsp), %zmm30
 216         vmovups   (%rsp), %zmm31
 217         movq      1080(%rsp), %r14
 218         cfi_restore (%r14)
 219         movq      1072(%rsp), %r15
 220         cfi_restore (%r15)
 221         vmovups   1216(%rsp), %zmm1
 222         jmp       .LBL_1_2
 223
 224 .LBL_1_10:
 225         cfi_restore_state
 226         movzbl    %r12b, %r15d
 227         vmovss    1156(%rsp,%r15,8), %xmm0
 228         call      JUMPTARGET(cosf)
 229         vmovss    %xmm0, 1220(%rsp,%r15,8)
 230         jmp       .LBL_1_8
 231
 232 .LBL_1_12:
 233         movzbl    %r12b, %r15d
 234         vmovss    1152(%rsp,%r15,8), %xmm0
 235         call      JUMPTARGET(cosf)
 236         vmovss    %xmm0, 1216(%rsp,%r15,8)
 237         jmp       .LBL_1_7
 238 #endif
 239 END (_ZGVeN16v_cosf_knl)
 240
 241 ENTRY (_ZGVeN16v_cosf_skx)
 242 #ifndef HAVE_AVX512_ASM_SUPPORT
 243 WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
 244 #else
 245 /*
 246   ALGORITHM DESCRIPTION:
 247
 248   1) Range reduction to [-Pi/2; +Pi/2] interval
 249      a) We remove sign using AND operation
 250      b) Add Pi/2 value to argument X for Cos to Sin transformation
 251      c) Getting octant Y by 1/Pi multiplication
 252      d) Add "Right Shifter" value
 253      e) Treat obtained value as integer for destination sign setting.
 254         Shift first bit of this value to the last (sign) position
 255      f) Subtract "Right Shifter"  value
 256      g) Subtract 0.5 from result for octant correction
 257      h) Subtract Y*PI from X argument, where PI divided to 4 parts:
 258         X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
 259   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
 260      a) Calculate X^2 = X * X
 261      b) Calculate polynomial:
 262         R = X + X * X^2 * (A3 + x^2 * (A5 + .....
 263   3) Destination sign setting
 264      a) Set shifted destination sign using XOR operation:
 265         R = XOR( R, S );
 266  */
 267         pushq     %rbp
 268         cfi_adjust_cfa_offset (8)
 269         cfi_rel_offset (%rbp, 0)
 270         movq      %rsp, %rbp
 271         cfi_def_cfa_register (%rbp)
 272         andq      $-64, %rsp
 273         subq      $1280, %rsp
 274         movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
 275
 276 /*
 277   h) Subtract Y*PI from X argument, where PI divided to 4 parts:
 278   X = X - Y*PI1 - Y*PI2 - Y*PI3
 279  */
 280         vmovaps   %zmm0, %zmm6
 281         vmovups   .L_2il0floatpacket.13(%rip), %zmm12
 282         vmovups __sRShifter(%rax), %zmm3
 283         vmovups __sPI1_FMA(%rax), %zmm5
 284         vmovups __sA9_FMA(%rax), %zmm9
 285
 286 /* b) Add Pi/2 value to argument X for Cos to Sin transformation */
 287         vaddps __sHalfPI(%rax), %zmm0, %zmm2
 288
 289 /* Check for large and special arguments */
 290         vandps __sAbsMask(%rax), %zmm0, %zmm1
 291
 292 /*
 293   1) Range reduction to [-Pi/2; +Pi/2] interval
 294   c) Getting octant Y by 1/Pi multiplication
 295   d) Add "Right Shifter" (0x4B000000) value
 296  */
 297         vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
 298         vcmpps    $18, __sRangeReductionVal(%rax), %zmm1, %k1
 299
 300 /*
 301   e) Treat obtained value as integer for destination sign setting.
 302   Shift first bit of this value to the last (sign) position (S << 31)
 303  */
 304         vpslld    $31, %zmm2, %zmm8
 305
 306 /* f) Subtract "Right Shifter" (0x4B000000) value */
 307         vsubps    %zmm3, %zmm2, %zmm4
 308
 309 /* g) Subtract 0.5 from result for octant correction */
 310         vsubps __sOneHalf(%rax), %zmm4, %zmm7
 311         vfnmadd231ps %zmm7, %zmm5, %zmm6
 312         vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
 313         vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
 314
 315 /* a) Calculate X^2 = X * X */
 316         vmulps    %zmm7, %zmm7, %zmm10
 317
 318 /*
 319   3) Destination sign setting
 320   a) Set shifted destination sign using XOR operation:
 321   R = XOR( R, S );
 322  */
 323         vxorps    %zmm8, %zmm7, %zmm11
 324
 325 /*
 326   b) Calculate polynomial:
 327   R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
 328  */
 329         vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
 330         vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
 331         vfmadd213ps __sA3(%rax), %zmm10, %zmm9
 332         vpandnd   %zmm1, %zmm1, %zmm12{%k1}
 333         vmulps    %zmm10, %zmm9, %zmm1
 334         vptestmd  %zmm12, %zmm12, %k0
 335         vfmadd213ps %zmm11, %zmm11, %zmm1
 336         kmovw     %k0, %ecx
 337         testl     %ecx, %ecx
 338         jne       .LBL_2_3
 339 .LBL_2_2:
 340         cfi_remember_state
 341         vmovaps   %zmm1, %zmm0
 342         movq      %rbp, %rsp
 343         cfi_def_cfa_register (%rsp)
 344         popq      %rbp
 345         cfi_adjust_cfa_offset (-8)
 346         cfi_restore (%rbp)
 347         ret
 348
 349 .LBL_2_3:
 350         cfi_restore_state
 351         vmovups   %zmm0, 1152(%rsp)
 352         vmovups   %zmm1, 1216(%rsp)
 353         je        .LBL_2_2
 354
 355         xorb      %dl, %dl
 356         xorl      %eax, %eax
 357         kmovw     %k4, 1048(%rsp)
 358         kmovw     %k5, 1040(%rsp)
 359         kmovw     %k6, 1032(%rsp)
 360         kmovw     %k7, 1024(%rsp)
 361         vmovups   %zmm16, 960(%rsp)
 362         vmovups   %zmm17, 896(%rsp)
 363         vmovups   %zmm18, 832(%rsp)
 364         vmovups   %zmm19, 768(%rsp)
 365         vmovups   %zmm20, 704(%rsp)
 366         vmovups   %zmm21, 640(%rsp)
 367         vmovups   %zmm22, 576(%rsp)
 368         vmovups   %zmm23, 512(%rsp)
 369         vmovups   %zmm24, 448(%rsp)
 370         vmovups   %zmm25, 384(%rsp)
 371         vmovups   %zmm26, 320(%rsp)
 372         vmovups   %zmm27, 256(%rsp)
 373         vmovups   %zmm28, 192(%rsp)
 374         vmovups   %zmm29, 128(%rsp)
 375         vmovups   %zmm30, 64(%rsp)
 376         vmovups   %zmm31, (%rsp)
 377         movq      %rsi, 1064(%rsp)
 378         movq      %rdi, 1056(%rsp)
 379         movq      %r12, 1096(%rsp)
 380         cfi_offset_rel_rsp (12, 1096)
 381         movb      %dl, %r12b
 382         movq      %r13, 1088(%rsp)
 383         cfi_offset_rel_rsp (13, 1088)
 384         movl      %ecx, %r13d
 385         movq      %r14, 1080(%rsp)
 386         cfi_offset_rel_rsp (14, 1080)
 387         movl      %eax, %r14d
 388         movq      %r15, 1072(%rsp)
 389         cfi_offset_rel_rsp (15, 1072)
 390         cfi_remember_state
 391
 392 .LBL_2_6:
 393         btl       %r14d, %r13d
 394         jc        .LBL_2_12
 395 .LBL_2_7:
 396         lea       1(%r14), %esi
 397         btl       %esi, %r13d
 398         jc        .LBL_2_10
 399 .LBL_2_8:
 400         incb      %r12b
 401         addl      $2, %r14d
 402         cmpb      $16, %r12b
 403         jb        .LBL_2_6
 404         kmovw     1048(%rsp), %k4
 405         kmovw     1040(%rsp), %k5
 406         kmovw     1032(%rsp), %k6
 407         kmovw     1024(%rsp), %k7
 408         vmovups   960(%rsp), %zmm16
 409         vmovups   896(%rsp), %zmm17
 410         vmovups   832(%rsp), %zmm18
 411         vmovups   768(%rsp), %zmm19
 412         vmovups   704(%rsp), %zmm20
 413         vmovups   640(%rsp), %zmm21
 414         vmovups   576(%rsp), %zmm22
 415         vmovups   512(%rsp), %zmm23
 416         vmovups   448(%rsp), %zmm24
 417         vmovups   384(%rsp), %zmm25
 418         vmovups   320(%rsp), %zmm26
 419         vmovups   256(%rsp), %zmm27
 420         vmovups   192(%rsp), %zmm28
 421         vmovups   128(%rsp), %zmm29
 422         vmovups   64(%rsp), %zmm30
 423         vmovups   (%rsp), %zmm31
 424         vmovups   1216(%rsp), %zmm1
 425         movq      1064(%rsp), %rsi
 426         movq      1056(%rsp), %rdi
 427         movq      1096(%rsp), %r12
 428         cfi_restore (%r12)
 429         movq      1088(%rsp), %r13
 430         cfi_restore (%r13)
 431         movq      1080(%rsp), %r14
 432         cfi_restore (%r14)
 433         movq      1072(%rsp), %r15
 434         cfi_restore (%r15)
 435         jmp       .LBL_2_2
 436
 437 .LBL_2_10:
 438         cfi_restore_state
 439         movzbl    %r12b, %r15d
 440         vmovss    1156(%rsp,%r15,8), %xmm0
 441         vzeroupper
 442         vmovss    1156(%rsp,%r15,8), %xmm0
 443         call      JUMPTARGET(cosf)
 444         vmovss    %xmm0, 1220(%rsp,%r15,8)
 445         jmp       .LBL_2_8
 446 .LBL_2_12:
 447         movzbl    %r12b, %r15d
 448         vmovss    1152(%rsp,%r15,8), %xmm0
 449         vzeroupper
 450         vmovss    1152(%rsp,%r15,8), %xmm0
 451         call      JUMPTARGET(cosf)
 452         vmovss    %xmm0, 1216(%rsp,%r15,8)
 453         jmp       .LBL_2_7
 454 #endif
 455 END (_ZGVeN16v_cosf_skx)
 456
 457         .section .rodata, "a"
 458 .L_2il0floatpacket.13:
 459         .long   0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 460         .type   .L_2il0floatpacket.13,@object