sysdeps/x86_64/fpu/svml_d_sincos4_core_avx.S

   1 /* Function sincos vectorized in AVX ISA as wrapper to SSE4 ISA version.
   2    Copyright (C) 2014-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #include "svml_d_wrapper_impl.h"
  21
  22         .section .text.avx2, "ax", @progbits
  23 ENTRY (_ZGVcN4vl8l8_sincos)
     /* AVX entry point for sincos.  Its whole body is the
        WRAPPER_IMPL_AVX_fFF macro instantiated with the SSE4 routine
        _ZGVbN2vl8l8_sincos as the callee.  WRAPPER_IMPL_AVX_fFF is not
        defined in this file; presumably it comes from
        svml_d_wrapper_impl.h -- confirm there for the exact register
        and stack contract.  */
  24 WRAPPER_IMPL_AVX_fFF _ZGVbN2vl8l8_sincos
  25 END (_ZGVcN4vl8l8_sincos)
  26
  27 /* AVX ISA version as wrapper to SSE ISA version (for vector
  28    function declared with #pragma omp declare simd notinbranch).  */
  29 .macro WRAPPER_IMPL_AVX_fFF_vvv callee
     /* Expands to the body of a 4-lane AVX routine built on a 2-lane
        routine (the macro argument "callee").

        In:   %ymm0           four input doubles
              LP64:  %xmm1..%xmm4  eight 8-byte destination pointers
              ILP32: %xmm1, %xmm2  eight 4-byte destination pointers
        Work: callee is invoked twice, once per 128-bit half of the
              input, with %xmm0 = two doubles and %rdi / %rsi = the
              addresses of two 16-byte result slots on the stack.
              Presumably %rdi receives the sines and %rsi the cosines
              (sincos argument order) -- confirm against the callee.
        Out:  each of the eight result doubles is stored through its
              own destination pointer; nothing is returned in registers.

        The code after ENTRY must keep the unwind information in sync
        with every change to the frame, so both branches carry CFI
        annotations.  */
  30 #ifndef __ILP32__
     /* LP64 frame, offsets from the 32-byte aligned %rsp:
            0 ..  31  first result buffer  (%rdi of the two calls)
           32 ..  63  second result buffer (%rsi of the two calls)
           64 ..  95  copy of the four input doubles from %ymm0
           96 .. 127  pointers for the first buffer,  %xmm1 then %xmm2
          128 .. 159  pointers for the second buffer, %xmm3 then %xmm4  */
  31         pushq     %rbp
             .cfi_adjust_cfa_offset 8
             .cfi_rel_offset %rbp, 0
  32         movq      %rsp, %rbp
             /* %rsp is realigned next, so the CFA has to be tracked
                through %rbp until the epilogue.  */
             .cfi_def_cfa_register %rbp
  33         andq      $-32, %rsp
  34         subq      $160, %rsp
  35         vmovupd   %ymm0, 64(%rsp)
  36         lea       (%rsp), %rdi
  37         vmovdqu   %xmm1, 96(%rdi)
  38         vmovdqu   %xmm2, 112(%rdi)
  39         vmovdqu   %xmm3, 128(%rdi)
  40         vmovdqu   %xmm4, 144(%rdi)
  41         lea       32(%rsp), %rsi
             /* Clear the upper YMM halves before entering the SSE callee;
                the low half of %ymm0 (lanes 0-1) stays in %xmm0.  */
  42         vzeroupper
  43         call      HIDDEN_JUMPTARGET(\callee)
             /* Second call: lanes 2-3 come from the saved copy, results go
                to the upper 16 bytes of each result buffer.  */
  44         vmovdqu   80(%rsp), %xmm0
  45         lea       16(%rsp), %rdi
  46         lea       48(%rsp), %rsi
  47         call      HIDDEN_JUMPTARGET(\callee)
             /* Scatter the first result buffer through the pointers saved
                at 96..127.  */
  48         movq      96(%rsp), %rdx
  49         movq      104(%rsp), %rsi
  50         movq      112(%rsp), %r8
  51         movq      120(%rsp), %r10
  52         movq      (%rsp), %rax
  53         movq      8(%rsp), %rcx
  54         movq      16(%rsp), %rdi
  55         movq      24(%rsp), %r9
  56         movq      %rax, (%rdx)
  57         movq      %rcx, (%rsi)
             /* %rax and %rcx are free again: start loading the pointers
                for the second result buffer (128..159).  */
  58         movq      128(%rsp), %rax
  59         movq      136(%rsp), %rcx
  60         movq      %rdi, (%r8)
  61         movq      %r9, (%r10)
  62         movq      144(%rsp), %rdi
  63         movq      152(%rsp), %r9
  64         movq      32(%rsp), %r11
  65         movq      40(%rsp), %rdx
  66         movq      48(%rsp), %rsi
  67         movq      56(%rsp), %r8
  68         movq      %r11, (%rax)
  69         movq      %rdx, (%rcx)
  70         movq      %rsi, (%rdi)
  71         movq      %r8, (%r9)
  72         movq      %rbp, %rsp
             .cfi_def_cfa_register %rsp
  73         popq      %rbp
             .cfi_adjust_cfa_offset -8
             .cfi_restore %rbp
  74         ret
  75 #else
     /* ILP32 (x32) frame, offsets from %rbp:
          -112 ..  -81  first result buffer  (%edi of the two calls)
           -80 ..  -49  second result buffer (%esi of the two calls)
          -128 .. -113  four 4-byte pointers from %xmm1
          -144 .. -129  four 4-byte pointers from %xmm2
          -176 .. -145  copy of the four input doubles from %ymm0
        %r10 keeps the incoming CFA; %rbx and %r12 keep the two buffer
        addresses alive across the calls.  */
  76         leal    8(%rsp), %r10d
  77         .cfi_def_cfa 10, 0
  78         andl    $-32, %esp
             /* Re-push the return address below the realigned %rsp.  */
  79         pushq   -8(%r10d)
  80         pushq   %rbp
             /* DW_CFA_expression: %rbp is saved at [%rbp + 0].  */
  81         .cfi_escape 0x10,0x6,0x2,0x76,0
  82         movl    %esp, %ebp
  83         pushq   %r12
  84         leal    -80(%rbp), %esi
  85         pushq   %r10
             /* DW_CFA_def_cfa_expression: CFA = *(%rbp - 16), the %r10
                slot just pushed.  Then %r12 is saved at [%rbp - 8].  */
  86         .cfi_escape 0xf,0x3,0x76,0x70,0x6
  87         .cfi_escape 0x10,0xc,0x2,0x76,0x78
  88         leal    -112(%rbp), %edi
  89         movq    %rsi, %r12
  90         pushq   %rbx
             /* DW_CFA_expression: %rbx is saved at [%rbp - 24].  */
  91         .cfi_escape 0x10,0x3,0x2,0x76,0x68
  92         movq    %rdi, %rbx
  93         subl    $152, %esp
  94         vmovaps %xmm1, -128(%ebp)
  95         vmovaps %xmm2, -144(%ebp)
  96         vmovapd %ymm0, -176(%ebp)
             /* Clear the upper YMM halves before entering the SSE callee;
                lanes 0-1 stay in %xmm0.  */
  97         vzeroupper
  98         call    HIDDEN_JUMPTARGET(\callee)
             /* Second call: lanes 2-3, upper 16 bytes of each buffer.  */
  99         leal    16(%r12), %esi
 100         vmovupd -160(%ebp), %xmm0
 101         leal    16(%rbx), %edi
 102         call    HIDDEN_JUMPTARGET(\callee)
             /* Scatter the first result buffer.  Even-numbered pointers
                are taken from the low 32 bits of a 64-bit reload,
                odd-numbered ones with vpextrd; stores use %eax only.  */
 103         movq    -128(%ebp), %rax
 104         vmovsd  -112(%ebp), %xmm0
 105         vmovdqa -128(%ebp), %xmm5
 106         vmovdqa -144(%ebp), %xmm1
 107         vmovsd  %xmm0, (%eax)
 108         vmovsd  -104(%ebp), %xmm0
 109         vpextrd $1, %xmm5, %eax
 110         vmovsd  %xmm0, (%eax)
 111         movq    -120(%ebp), %rax
 112         vmovsd  -96(%ebp), %xmm0
 113         vmovsd  %xmm0, (%eax)
 114         vmovsd  -88(%ebp), %xmm0
 115         vpextrd $3, %xmm5, %eax
 116         vmovsd  %xmm0, (%eax)
             /* Scatter the second result buffer the same way.  */
 117         movq    -144(%ebp), %rax
 118         vmovsd  -80(%ebp), %xmm0
 119         vmovsd  %xmm0, (%eax)
 120         vmovsd  -72(%ebp), %xmm0
 121         vpextrd $1, %xmm1, %eax
 122         vmovsd  %xmm0, (%eax)
 123         movq    -136(%ebp), %rax
 124         vmovsd  -64(%ebp), %xmm0
 125         vmovsd  %xmm0, (%eax)
 126         vmovsd  -56(%ebp), %xmm0
 127         vpextrd $3, %xmm1, %eax
 128         vmovsd  %xmm0, (%eax)
 129         addl    $152, %esp
 130         popq    %rbx
 131         popq    %r10
 132         .cfi_def_cfa 10, 0
 133         popq    %r12
 134         popq    %rbp
             /* Back to the original stack: %r10 is the incoming %rsp + 8.  */
 135         leal    -8(%r10), %esp
 136         .cfi_def_cfa 7, 8
 137         ret
 138 #endif
 139 .endm
 140
 141 ENTRY (_ZGVcN4vvv_sincos)
     /* AVX entry point for sincos whose output pointers arrive in vector
        registers.  Its whole body is the WRAPPER_IMPL_AVX_fFF_vvv macro
        defined in this file, instantiated with the SSE4 routine
        _ZGVbN2vl8l8_sincos as the callee for each half of the input.  */
 142 WRAPPER_IMPL_AVX_fFF_vvv _ZGVbN2vl8l8_sincos
 143 END (_ZGVcN4vvv_sincos)