sysdeps/x86_64/fpu/multiarch/svml_d_hypot8_core_avx512.S

   1 /* Function hypot vectorized with AVX-512.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *
  22  *      HIGH LEVEL OVERVIEW
  23  *
  24  *      Calculate z = (x*x+y*y)
  25  *      Calculate reciplicle sqrt (z)
  26  *      Calculate error = z*(rsqrt(z)*rsqrt(z)) - 1
  27  *      Calculate fixing part p with polynom
  28  *      Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
  29  *
  30  *      ALGORITHM DETAILS
  31  *
  32  *    Multiprecision branch for _HA_ only
  33  *      Remove sigm from both arguments
  34  *      Find maximum (_x) and minimum (_y) (by abs value) between arguments
  35  *      Split _x int _a and _b for multiprecision
  36  *      If _x >> _y we will we will not split _y for multiprecision
  37  *      all _y will be put into lower part (_d) and higher part (_c = 0)
  38  *      Fixing _hilo_mask for the case _x >> _y
  39  *      Split _y into _c and _d for multiprecision with fixed mask
  40  *
  41  *      compute Hi and Lo parts of _z = _x*_x + _y*_y
  42  *
  43  *      _zHi = _a*_a + _c*_c
  44  *      _zLo = (_x + _a)*_b + _d*_y + _d*_c
  45  *      _z = _zHi + _zLo
  46  *
  47  *    No multiprecision branch for _LA_ and _EP_
  48  *      _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
  49  *
  50  *    Check _z exponent to be withing borders [3BC ; 441] else goto Callout
  51  *
  52  *    _s  ~ 1.0/sqrt(_z)
  53  *    _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
  54  *    _e[rror]  =  (1.0/_z + O) * _z - 1.0
  55  *    calculate fixing part _p
  56  *    _p = (((_POLY_C5*_e + _POLY_C4)*_e +_POLY_C3)*_e +_POLY_C2)*_e + _POLY_C1
  57  *    some parts of polynom are skipped for lower flav
  58  *
  59  *    result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
  60  *
  61  *
  62  */
  63
  64 /* Offsets for data table __svml_dhypot_data_internal
  65  */
  66 #define _dAbsMask                       0
  67 #define _lExpBound_uisa                 64
  68 #define _lExpBound                      128
  69 #define _dHalf                          192
  70
  71 #include <sysdep.h>
  72
  73         .section .text.evex512, "ax", @progbits
  74 ENTRY(_ZGVeN8vv_hypot_skx)
  75         pushq   %rbp
  76         cfi_def_cfa_offset(16)
  77         movq    %rsp, %rbp
  78         cfi_def_cfa(6, 16)
  79         cfi_offset(6, -16)
  80         andq    $-64, %rsp
  81         subq    $256, %rsp
  82         vgetexppd {sae}, %zmm0, %zmm2
  83         vgetexppd {sae}, %zmm1, %zmm3
  84         vmovups _dHalf+__svml_dhypot_data_internal(%rip), %zmm9
  85         vmaxpd  {sae}, %zmm3, %zmm2, %zmm4
  86         vmulpd  {rn-sae}, %zmm0, %zmm0, %zmm2
  87         vandpd  _dAbsMask+__svml_dhypot_data_internal(%rip), %zmm4, %zmm5
  88         vfmadd231pd {rn-sae}, %zmm1, %zmm1, %zmm2
  89
  90         /* Select exponent bound so that no scaling is needed */
  91         vpcmpq  $5, _lExpBound_uisa+__svml_dhypot_data_internal(%rip), %zmm5, %k0
  92         vrsqrt14pd %zmm2, %zmm6
  93         kmovw   %k0, %edx
  94         vmulpd  {rn-sae}, %zmm6, %zmm2, %zmm7
  95         vmulpd  {rn-sae}, %zmm6, %zmm9, %zmm8
  96         vfnmadd231pd {rn-sae}, %zmm7, %zmm8, %zmm9
  97         vfmadd231pd {rn-sae}, %zmm9, %zmm8, %zmm8
  98         vfmadd213pd {rn-sae}, %zmm7, %zmm7, %zmm9
  99         vfnmadd231pd {rn-sae}, %zmm9, %zmm9, %zmm2
 100         vfmadd213pd {rn-sae}, %zmm9, %zmm8, %zmm2
 101
 102         /*  The end of implementation  */
 103         testl   %edx, %edx
 104
 105         /* Go to special inputs processing branch */
 106         jne     L(SPECIAL_VALUES_BRANCH)
 107         # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1 zmm2
 108
 109         /* Restore registers
 110          * and exit the function
 111          */
 112
 113 L(EXIT):
 114         vmovaps %zmm2, %zmm0
 115         movq    %rbp, %rsp
 116         popq    %rbp
 117         cfi_def_cfa(7, 8)
 118         cfi_restore(6)
 119         ret
 120         cfi_def_cfa(6, 16)
 121         cfi_offset(6, -16)
 122
 123         /* Branch to process
 124          * special inputs
 125          */
 126
 127 L(SPECIAL_VALUES_BRANCH):
 128         vmovups %zmm0, 64(%rsp)
 129         vmovups %zmm1, 128(%rsp)
 130         vmovups %zmm2, 192(%rsp)
 131         # LOE rbx r12 r13 r14 r15 edx zmm2
 132
 133         xorl    %eax, %eax
 134         # LOE rbx r12 r13 r14 r15 eax edx
 135
 136         vzeroupper
 137         movq    %r12, 16(%rsp)
 138         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus)  */
 139         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
 140         movl    %eax, %r12d
 141         movq    %r13, 8(%rsp)
 142         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus)  */
 143         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
 144         movl    %edx, %r13d
 145         movq    %r14, (%rsp)
 146         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus)  */
 147         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
 148         # LOE rbx r15 r12d r13d
 149
 150         /* Range mask
 151          * bits check
 152          */
 153
 154 L(RANGEMASK_CHECK):
 155         btl     %r12d, %r13d
 156
 157         /* Call scalar math function */
 158         jc      L(SCALAR_MATH_CALL)
 159         # LOE rbx r15 r12d r13d
 160
 161         /* Special inputs
 162          * processing loop
 163          */
 164
 165 L(SPECIAL_VALUES_LOOP):
 166         incl    %r12d
 167         cmpl    $8, %r12d
 168
 169         /* Check bits in range mask */
 170         jl      L(RANGEMASK_CHECK)
 171         # LOE rbx r15 r12d r13d
 172
 173         movq    16(%rsp), %r12
 174         cfi_restore(12)
 175         movq    8(%rsp), %r13
 176         cfi_restore(13)
 177         movq    (%rsp), %r14
 178         cfi_restore(14)
 179         vmovups 192(%rsp), %zmm2
 180
 181         /* Go to exit */
 182         jmp     L(EXIT)
 183         /*  DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -240; DW_OP_plus)  */
 184         .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x10, 0xff, 0xff, 0xff, 0x22
 185         /*  DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -248; DW_OP_plus)  */
 186         .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x08, 0xff, 0xff, 0xff, 0x22
 187         /*  DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -256; DW_OP_plus)  */
 188         .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x00, 0xff, 0xff, 0xff, 0x22
 189         # LOE rbx r12 r13 r14 r15 zmm2
 190
 191         /* Scalar math fucntion call
 192          * to process special input
 193          */
 194
 195 L(SCALAR_MATH_CALL):
 196         movl    %r12d, %r14d
 197         vmovsd  64(%rsp, %r14, 8), %xmm0
 198         vmovsd  128(%rsp, %r14, 8), %xmm1
 199         call    hypot@PLT
 200         # LOE rbx r14 r15 r12d r13d xmm0
 201
 202         vmovsd  %xmm0, 192(%rsp, %r14, 8)
 203
 204         /* Process special inputs in loop */
 205         jmp     L(SPECIAL_VALUES_LOOP)
 206         # LOE rbx r15 r12d r13d
 207 END(_ZGVeN8vv_hypot_skx)
 208
 209         .section .rodata, "a"
 210         .align  64
 211
 212 #ifdef __svml_dhypot_data_internal_typedef
 213 typedef unsigned int VUINT32;
 214 typedef struct {
 215         __declspec(align(64)) VUINT32 _dAbsMask[8][2];
 216         __declspec(align(64)) VUINT32 _lExpBound_uisa[8][2];
 217         __declspec(align(64)) VUINT32 _lExpBound[8][2];
 218         __declspec(align(64)) VUINT32 _dHalf[8][2];
 219 } __svml_dhypot_data_internal;
 220 #endif
 221 __svml_dhypot_data_internal:
 222         /* legacy algorithm */
 223         .quad   0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff, 0x7fffffffffffffff /* _dAbsMask */
 224         /* fma based algorithm*/
 225         .align  64
 226         .quad   0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000, 0x407ff00000000000 /* _lExpBound_uisa */
 227         .align  64
 228         .quad   0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000, 0x404f800000000000 /* _lExpBound */
 229         .align  64
 230         .quad   0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000, 0x3FE0000000000000 /* _dHalf */
 231         .align  64
 232         .type   __svml_dhypot_data_internal, @object
 233         .size   __svml_dhypot_data_internal, .-__svml_dhypot_data_internal