sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S

   1 /* Function cosf vectorized with SSE4.
   2    Copyright (C) 2014-2015 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_s_cosf_data.h"
  21
  22         .text
  23 ENTRY (_ZGVbN4v_cosf_sse4)
  24 /*
  25   ALGORITHM DESCRIPTION:
  26
  27    1) Range reduction to [-Pi/2; +Pi/2] interval
  28       a) We remove sign using AND operation
  29       b) Add Pi/2 value to argument X for Cos to Sin transformation
  30       c) Getting octant Y by 1/Pi multiplication
  31       d) Add "Right Shifter" value
  32       e) Treat obtained value as integer for destination sign setting.
  33          Shift first bit of this value to the last (sign) position
  34       f) Subtract "Right Shifter"  value
  35       g) Subtract 0.5 from result for octant correction
  36       h) Subtract Y*PI from X argument, where PI divided to 4 parts:
  37          X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
  38    2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
  39       a) Calculate X^2 = X * X
  40       b) Calculate polynomial:
  41          R = X + X * X^2 * (A3 + x^2 * (A5 + .....
  42    3) Destination sign setting
  43       a) Set shifted destination sign using XOR operation:
  44          R = XOR( R, S );
  45  */
  46         pushq     %rbp
  47         cfi_adjust_cfa_offset (8)
  48         cfi_rel_offset (%rbp, 0)
  49         movq      %rsp, %rbp
  50         cfi_def_cfa_register (%rbp)
  51         andq      $-64, %rsp
  52         subq      $320, %rsp
  53         movaps    %xmm0, %xmm4
  54         movq      __svml_scos_data@GOTPCREL(%rip), %rax
  55         movups __sHalfPI(%rax), %xmm1
  56         movups __sRShifter(%rax), %xmm5
  57
  58 /* b) Add Pi/2 value to argument X for Cos to Sin transformation */
  59         addps     %xmm4, %xmm1
  60
  61 /*
  62   1) Range reduction to [-Pi/2; +Pi/2] interval
  63   c) Getting octant Y by 1/Pi multiplication
  64   d) Add "Right Shifter" (0x4B000000) value
  65  */
  66         mulps __sInvPI(%rax), %xmm1
  67         movups __sPI1(%rax), %xmm6
  68         addps     %xmm5, %xmm1
  69
  70 /*
  71   e) Treat obtained value as integer for destination sign setting.
  72   Shift first bit of this value to the last (sign) position (S << 31)
  73  */
  74         movaps    %xmm1, %xmm2
  75
  76 /* f) Subtract "Right Shifter" (0x4B000000) value */
  77         subps     %xmm5, %xmm1
  78         movups __sPI2(%rax), %xmm7
  79         pslld     $31, %xmm2
  80         movups __sPI3(%rax), %xmm5
  81         movups __sAbsMask(%rax), %xmm3
  82
  83 /* Check for large and special arguments */
  84         andps     %xmm4, %xmm3
  85
  86 /* g) Subtract 0.5 from result for octant correction */
  87         subps __sOneHalf(%rax), %xmm1
  88         cmpnleps __sRangeReductionVal(%rax), %xmm3
  89
  90 /*
  91   h) Subtract Y*PI from X argument, where PI divided to 4 parts:
  92   X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
  93  */
  94         mulps     %xmm1, %xmm6
  95         mulps     %xmm1, %xmm7
  96         mulps     %xmm1, %xmm5
  97         subps     %xmm6, %xmm0
  98         movmskps  %xmm3, %ecx
  99         movups __sPI4(%rax), %xmm6
 100         subps     %xmm7, %xmm0
 101         mulps     %xmm6, %xmm1
 102         subps     %xmm5, %xmm0
 103         subps     %xmm1, %xmm0
 104
 105 /* a) Calculate X^2 = X * X */
 106         movaps    %xmm0, %xmm1
 107         mulps     %xmm0, %xmm1
 108
 109 /*
 110   3) Destination sign setting
 111   a) Set shifted destination sign using XOR operation:
 112   R = XOR( R, S );
 113  */
 114         xorps     %xmm2, %xmm0
 115         movups __sA9(%rax), %xmm2
 116
 117 /*
 118   b) Calculate polynomial:
 119   R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
 120  */
 121         mulps     %xmm1, %xmm2
 122         addps __sA7(%rax), %xmm2
 123         mulps     %xmm1, %xmm2
 124         addps __sA5(%rax), %xmm2
 125         mulps     %xmm1, %xmm2
 126         addps __sA3(%rax), %xmm2
 127         mulps     %xmm2, %xmm1
 128         mulps     %xmm0, %xmm1
 129         addps     %xmm1, %xmm0
 130         testl     %ecx, %ecx
 131         jne       .LBL_1_3
 132
 133 .LBL_1_2:
 134         cfi_remember_state
 135         movq      %rbp, %rsp
 136         cfi_def_cfa_register (%rsp)
 137         popq      %rbp
 138         cfi_adjust_cfa_offset (-8)
 139         cfi_restore (%rbp)
 140         ret
 141
 142 .LBL_1_3:
 143         cfi_restore_state
 144         movups    %xmm4, 192(%rsp)
 145         movups    %xmm0, 256(%rsp)
 146         je        .LBL_1_2
 147
 148         xorb      %dl, %dl
 149         xorl      %eax, %eax
 150         movups    %xmm8, 112(%rsp)
 151         movups    %xmm9, 96(%rsp)
 152         movups    %xmm10, 80(%rsp)
 153         movups    %xmm11, 64(%rsp)
 154         movups    %xmm12, 48(%rsp)
 155         movups    %xmm13, 32(%rsp)
 156         movups    %xmm14, 16(%rsp)
 157         movups    %xmm15, (%rsp)
 158         movq      %rsi, 136(%rsp)
 159         movq      %rdi, 128(%rsp)
 160         movq      %r12, 168(%rsp)
 161         cfi_offset_rel_rsp (12, 168)
 162         movb      %dl, %r12b
 163         movq      %r13, 160(%rsp)
 164         cfi_offset_rel_rsp (13, 160)
 165         movl      %ecx, %r13d
 166         movq      %r14, 152(%rsp)
 167         cfi_offset_rel_rsp (14, 152)
 168         movl      %eax, %r14d
 169         movq      %r15, 144(%rsp)
 170         cfi_offset_rel_rsp (15, 144)
 171         cfi_remember_state
 172
 173 .LBL_1_6:
 174         btl       %r14d, %r13d
 175         jc        .LBL_1_12
 176
 177 .LBL_1_7:
 178         lea       1(%r14), %esi
 179         btl       %esi, %r13d
 180         jc        .LBL_1_10
 181
 182 .LBL_1_8:
 183         incb      %r12b
 184         addl      $2, %r14d
 185         cmpb      $16, %r12b
 186         jb        .LBL_1_6
 187
 188         movups    112(%rsp), %xmm8
 189         movups    96(%rsp), %xmm9
 190         movups    80(%rsp), %xmm10
 191         movups    64(%rsp), %xmm11
 192         movups    48(%rsp), %xmm12
 193         movups    32(%rsp), %xmm13
 194         movups    16(%rsp), %xmm14
 195         movups    (%rsp), %xmm15
 196         movq      136(%rsp), %rsi
 197         movq      128(%rsp), %rdi
 198         movq      168(%rsp), %r12
 199         cfi_restore (%r12)
 200         movq      160(%rsp), %r13
 201         cfi_restore (%r13)
 202         movq      152(%rsp), %r14
 203         cfi_restore (%r14)
 204         movq      144(%rsp), %r15
 205         cfi_restore (%r15)
 206         movups    256(%rsp), %xmm0
 207         jmp       .LBL_1_2
 208
 209 .LBL_1_10:
 210         cfi_restore_state
 211         movzbl    %r12b, %r15d
 212         movss     196(%rsp,%r15,8), %xmm0
 213
 214         call      cosf@PLT
 215
 216         movss     %xmm0, 260(%rsp,%r15,8)
 217         jmp       .LBL_1_8
 218
 219 .LBL_1_12:
 220         movzbl    %r12b, %r15d
 221         movss     192(%rsp,%r15,8), %xmm0
 222
 223         call      cosf@PLT
 224
 225         movss     %xmm0, 256(%rsp,%r15,8)
 226         jmp       .LBL_1_7
 227 END (_ZGVbN4v_cosf_sse4)