sysdeps/x86_64/fpu/svml_d_sincos4_core.S

   1 /* Function sincos vectorized with AVX2, wrapper version.
   2    Copyright (C) 2014-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_d_wrapper_impl.h"
  21
  22         .section .text.avx2, "ax", @progbits  /* Code below goes into the allocatable, executable section .text.avx2.  */
  23 ENTRY (_ZGVdN4vl8l8_sincos)  /* Entry point for the symbol _ZGVdN4vl8l8_sincos.  */
  24 WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos  /* Body is this macro expanded with _ZGVbN2vl8l8_sincos as its argument; the macro is not defined in this file (presumably it comes from svml_d_wrapper_impl.h - confirm).  */
  25 END (_ZGVdN4vl8l8_sincos)
  26 libmvec_hidden_def (_ZGVdN4vl8l8_sincos)  /* Emitted unconditionally here, unlike the USE_MULTIARCH-guarded one for _ZGVdN4vvv_sincos.  */
  27
  28 /* AVX2 ISA version as wrapper to SSE ISA version (for vector
  29    function declared with #pragma omp declare simd notinbranch).  The body spills ymm0 (inputs) and xmm/ymm 1 and 2 (destination pointers) to an aligned frame, calls the callee once per pair of lanes, then stores each result through its own pointer.  NOTE(review): the 1st/2nd outputs are presumably sin/cos, following sincos (x, sinp, cosp) - confirm against the callee.  */
  30 .macro WRAPPER_IMPL_AVX2_fFF_vvv callee  /* callee: symbol called twice below via HIDDEN_JUMPTARGET.  */
  31 #ifndef __ILP32__  /* This branch loads 8-byte pointers from the spilled ymm1/ymm2 (four each).  */
  32         pushq     %rbp
  33         cfi_adjust_cfa_offset (8)
  34         cfi_rel_offset (%rbp, 0)
  35         movq      %rsp, %rbp  /* %rbp remembers the unaligned %rsp for the epilogue.  */
  36         cfi_def_cfa_register (%rbp)
  37         andq      $-32, %rsp  /* Round %rsp down to a 32-byte boundary.  */
  38         subq      $160, %rsp  /* Five 32-byte slots; %rsp stays 32-byte aligned for the calls.  */
  39         vmovupd   %ymm0, 128(%rsp)  /* [128,160): the four input doubles.  */
  40         lea       (%rsp), %rdi  /* 1st call: 1st output buffer starts at offset 0.  */
  41         vmovdqu   %ymm1, 64(%rdi)  /* [64,96): four destination pointers for the 1st output (%rdi == %rsp here).  */
  42         vmovdqu   %ymm2, 96(%rdi)  /* [96,128): four destination pointers for the 2nd output.  */
  43         lea       32(%rsp), %rsi  /* 1st call: 2nd output buffer starts at offset 32.  */
  44         vzeroupper  /* Zero the upper YMM halves before the call; xmm0 still holds input lanes 0-1.  */
  45         call      HIDDEN_JUMPTARGET(\callee)  /* Expected to leave two doubles at (%rdi) and two at (%rsi); they are read back below.  */
  46         vmovupd   144(%rsp), %xmm0  /* Input lanes 2-3 (upper half of the spilled ymm0).  */
  47         lea       16(%rsp), %rdi  /* 2nd call: upper half of the 1st output buffer.  */
  48         lea       48(%rsp), %rsi  /* 2nd call: upper half of the 2nd output buffer.  */
  49         call      HIDDEN_JUMPTARGET(\callee)
  50         movq      64(%rsp), %rdx  /* Scatter phase: pointers and results are all reloaded from the frame.  %rdx = 1st-output pointer, lane 0.  */
  51         movq      96(%rsp), %rsi  /* 2nd-output pointer, lane 0.  */
  52         movq      72(%rsp), %r8  /* 1st-output pointer, lane 1.  */
  53         movq      104(%rsp), %r10  /* 2nd-output pointer, lane 1.  */
  54         movq      (%rsp), %rax  /* 1st-output result, lane 0.  */
  55         movq      32(%rsp), %rcx  /* 2nd-output result, lane 0.  */
  56         movq      8(%rsp), %rdi  /* 1st-output result, lane 1.  */
  57         movq      40(%rsp), %r9  /* 2nd-output result, lane 1.  */
  58         movq      %rax, (%rdx)  /* Store lane 0 results.  */
  59         movq      %rcx, (%rsi)
  60         movq      80(%rsp), %rax  /* 1st-output pointer, lane 2 (reuses %rax).  */
  61         movq      112(%rsp), %rcx  /* 2nd-output pointer, lane 2 (reuses %rcx).  */
  62         movq      %rdi, (%r8)  /* Store lane 1 results.  */
  63         movq      %r9, (%r10)
  64         movq      88(%rsp), %rdi  /* 1st-output pointer, lane 3.  */
  65         movq      120(%rsp), %r9  /* 2nd-output pointer, lane 3.  */
  66         movq      16(%rsp), %r11  /* 1st-output result, lane 2.  */
  67         movq      48(%rsp), %rdx  /* 2nd-output result, lane 2.  */
  68         movq      24(%rsp), %rsi  /* 1st-output result, lane 3.  */
  69         movq      56(%rsp), %r8  /* 2nd-output result, lane 3.  */
  70         movq      %r11, (%rax)  /* Store lane 2 results.  */
  71         movq      %rdx, (%rcx)
  72         movq      %rsi, (%rdi)  /* Store lane 3 results.  */
  73         movq      %r8, (%r9)
  74         movq      %rbp, %rsp  /* Drop the aligned frame.  */
  75         cfi_def_cfa_register (%rsp)
  76         popq      %rbp
  77         cfi_adjust_cfa_offset (-8)
  78         cfi_restore (%rbp)
  79         ret
  80 #else  /* __ILP32__ defined: pointers are extracted as 4-byte values, four per xmm register.  */
  81         leal    8(%rsp), %r10d  /* %r10 = entry %rsp + 8, i.e. just above the return address.  */
  82         .cfi_def_cfa 10, 0  /* CFA = %r10 (DWARF register 10).  */
  83         andl    $-32, %esp  /* Round the stack pointer down to a 32-byte boundary.  */
  84         pushq   -8(%r10d)  /* Copy the return address into the realigned frame.  */
  85         pushq   %rbp
  86         .cfi_escape 0x10,0x6,0x2,0x76,0  /* DW_CFA_expression: %rbp is saved at 0(%rbp).  */
  87         movl    %esp, %ebp
  88         pushq   %r12  /* Preserved for the caller; used below to keep the 2nd buffer address across the first call.  */
  89         leal    -80(%rbp), %esi  /* 2nd output buffer starts at -80(%rbp).  */
  90         pushq   %r10  /* Lands at -16(%rbp); needed by the epilogue and the CFA expression.  */
  91         .cfi_escape 0xf,0x3,0x76,0x70,0x6  /* DW_CFA_def_cfa_expression: CFA = value stored at -16(%rbp).  */
  92         .cfi_escape 0x10,0xc,0x2,0x76,0x78  /* DW_CFA_expression: %r12 is saved at -8(%rbp).  */
  93         leal    -112(%rbp), %edi  /* 1st output buffer starts at -112(%rbp).  */
  94         movq    %rsi, %r12  /* Keep the 2nd buffer address for the second call.  */
  95         pushq   %rbx
  96         .cfi_escape 0x10,0x3,0x2,0x76,0x68  /* DW_CFA_expression: %rbx is saved at -24(%rbp).  */
  97         movq    %rdi, %rbx  /* Keep the 1st buffer address for the second call.  */
  98         subl    $152, %esp  /* Locals; with the three pushes above the stack pointer is now %ebp - 176.  */
  99         vmovaps %xmm1, -128(%ebp)  /* Four 4-byte destination pointers for the 1st output.  */
 100         vmovaps %xmm2, -144(%ebp)  /* Four 4-byte destination pointers for the 2nd output.  */
 101         vmovapd %ymm0, -176(%ebp)  /* The four input doubles.  */
 102         vzeroupper  /* Zero the upper YMM halves before the call; xmm0 still holds input lanes 0-1.  */
 103         call    HIDDEN_JUMPTARGET(\callee)  /* Expected to leave two doubles at (%edi) and two at (%esi); they are read back below.  */
 104         leal    16(%r12), %esi  /* 2nd call: upper half of the 2nd output buffer.  */
 105         vmovapd -160(%ebp), %xmm0  /* Input lanes 2-3.  */
 106         leal    16(%rbx), %edi  /* 2nd call: upper half of the 1st output buffer.  */
 107         call    HIDDEN_JUMPTARGET(\callee)
 108         movq    -128(%ebp), %rax  /* Only %eax is used below: 1st-output pointer, lane 0.  */
 109         vmovsd  -112(%ebp), %xmm0  /* 1st-output result, lane 0.  */
 110         vmovdqa -128(%ebp), %xmm5  /* All four 1st-output pointers, for vpextrd.  */
 111         vmovdqa -144(%ebp), %xmm1  /* All four 2nd-output pointers, for vpextrd.  */
 112         vmovsd  %xmm0, (%eax)
 113         vmovsd  -104(%ebp), %xmm0  /* 1st-output result, lane 1.  */
 114         vpextrd $1, %xmm5, %eax  /* 1st-output pointer, lane 1.  */
 115         vmovsd  %xmm0, (%eax)
 116         movq    -120(%ebp), %rax  /* %eax = 1st-output pointer, lane 2.  */
 117         vmovsd  -96(%ebp), %xmm0  /* 1st-output result, lane 2.  */
 118         vmovsd  %xmm0, (%eax)
 119         vmovsd  -88(%ebp), %xmm0  /* 1st-output result, lane 3.  */
 120         vpextrd $3, %xmm5, %eax  /* 1st-output pointer, lane 3.  */
 121         vmovsd  %xmm0, (%eax)
 122         movq    -144(%ebp), %rax  /* %eax = 2nd-output pointer, lane 0.  */
 123         vmovsd  -80(%ebp), %xmm0  /* 2nd-output result, lane 0.  */
 124         vmovsd  %xmm0, (%eax)
 125         vmovsd  -72(%ebp), %xmm0  /* 2nd-output result, lane 1.  */
 126         vpextrd $1, %xmm1, %eax  /* 2nd-output pointer, lane 1.  */
 127         vmovsd  %xmm0, (%eax)
 128         movq    -136(%ebp), %rax  /* %eax = 2nd-output pointer, lane 2.  */
 129         vmovsd  -64(%ebp), %xmm0  /* 2nd-output result, lane 2.  */
 130         vmovsd  %xmm0, (%eax)
 131         vmovsd  -56(%ebp), %xmm0  /* 2nd-output result, lane 3.  */
 132         vpextrd $3, %xmm1, %eax  /* 2nd-output pointer, lane 3.  */
 133         vmovsd  %xmm0, (%eax)
 134         addl    $152, %esp  /* Release locals; pops below mirror the pushes in reverse order.  */
 135         popq    %rbx
 136         popq    %r10
 137         .cfi_def_cfa 10, 0  /* CFA = %r10 again, now that it is reloaded.  */
 138         popq    %r12
 139         popq    %rbp
 140         leal    -8(%r10), %esp  /* Back to the entry stack pointer, which points at the original return address.  */
 141         .cfi_def_cfa 7, 8  /* CFA = %rsp + 8 (DWARF register 7).  */
 142         ret
 143 #endif
 144 .endm
 145
 146 ENTRY (_ZGVdN4vvv_sincos)  /* Entry point for the symbol _ZGVdN4vvv_sincos.  */
 147 WRAPPER_IMPL_AVX2_fFF_vvv _ZGVbN2vl8l8_sincos  /* Body is the macro defined above in this file, with _ZGVbN2vl8l8_sincos as the callee it invokes twice.  */
 148 END (_ZGVdN4vvv_sincos)
 149
 150 #ifndef USE_MULTIARCH  /* The hidden definition is emitted only when USE_MULTIARCH is not defined.  */
 151  libmvec_hidden_def (_ZGVdN4vvv_sincos)
 152 #endif