sysdeps/x86_64/fpu/multiarch/svml_s_atan2f4_core_sse4.S

   1 /* Function atan2f vectorized with SSE4.
   2    Copyright (C) 2021-2023 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    https://www.gnu.org/licenses/.  */
  18
  19 /*
  20  * ALGORITHM DESCRIPTION:
  21  *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
  22  *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
  23  *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
  24  *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
  25  *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
  26  *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/0.16.
  27  *
  28  *
  29  */
  30
  31 /* Offsets for data table __svml_satan2_data_internal
  32  */
  33 #define sZERO                           0
  34 #define sSIGN_MASK                      16
  35 #define sABS_MASK                       32
  36 #define sPIO2                           48
  37 #define sPI                             64
  38 #define sPC8                            80
  39 #define sPC7                            96
  40 #define sPC6                            112
  41 #define sPC5                            128
  42 #define sPC4                            144
  43 #define sPC3                            160
  44 #define sPC2                            176
  45 #define sPC1                            192
  46 #define sPC0                            208
  47 #define iCHK_WORK_SUB                   224
  48 #define iCHK_WORK_CMP                   240
  49
  50 #include <sysdep.h>
  51
  52         .section .text.sse4, "ax", @progbits
  53 ENTRY(_ZGVbN4vv_atan2f_sse4)
  54         subq    $88, %rsp
  55         cfi_def_cfa_offset(96)
  56         movaps  %xmm0, %xmm12
  57
  58         /*
  59          * #define NO_VECTOR_ZERO_ATAN2_ARGS
  60          *  Declarations
  61          * Variables
  62          * Constants
  63          *  The end of declarations
  64          *  Implementation
  65          * Arguments signs
  66          */
  67         movups  sABS_MASK+__svml_satan2_data_internal(%rip), %xmm10
  68         movaps  %xmm1, %xmm13
  69         movaps  %xmm10, %xmm11
  70         andps   %xmm12, %xmm10
  71         andps   %xmm13, %xmm11
  72         movaps  %xmm10, %xmm7
  73         cmpltps %xmm11, %xmm7
  74
  75         /*
  76          * 1) If y<x then a= y, b=x, PIO2=0
  77          * 2) If y>x then a=-x, b=y, PIO2=Pi/2
  78          */
  79         movups  sSIGN_MASK+__svml_satan2_data_internal(%rip), %xmm6
  80         movaps  %xmm7, %xmm0
  81         orps    %xmm11, %xmm6
  82         movaps  %xmm10, %xmm4
  83         andnps  %xmm6, %xmm0
  84         movaps  %xmm7, %xmm6
  85         movaps  %xmm11, %xmm5
  86         andps   %xmm7, %xmm4
  87         andnps  %xmm10, %xmm6
  88         andps   %xmm7, %xmm5
  89         orps    %xmm4, %xmm0
  90         orps    %xmm5, %xmm6
  91
  92         /* Division a/b. */
  93         divps   %xmm6, %xmm0
  94
  95         /* Testing on working interval. */
  96         movdqu  iCHK_WORK_SUB+__svml_satan2_data_internal(%rip), %xmm14
  97         movaps  %xmm11, %xmm15
  98         movaps  %xmm10, %xmm3
  99         psubd   %xmm14, %xmm15
 100         psubd   %xmm14, %xmm3
 101         movdqa  %xmm15, %xmm1
 102         movdqu  iCHK_WORK_CMP+__svml_satan2_data_internal(%rip), %xmm2
 103         movdqa  %xmm3, %xmm14
 104         pcmpgtd %xmm2, %xmm1
 105         pcmpeqd %xmm2, %xmm15
 106         pcmpgtd %xmm2, %xmm14
 107         pcmpeqd %xmm2, %xmm3
 108         por     %xmm15, %xmm1
 109         por     %xmm3, %xmm14
 110         por     %xmm14, %xmm1
 111
 112         /* Polynomial. */
 113         movaps  %xmm0, %xmm14
 114         mulps   %xmm0, %xmm14
 115         movaps  %xmm13, %xmm4
 116         movmskps %xmm1, %ecx
 117         movaps  %xmm14, %xmm15
 118         movaps  %xmm11, %xmm9
 119         mulps   %xmm14, %xmm15
 120         pxor    %xmm13, %xmm9
 121         movups  sPC8+__svml_satan2_data_internal(%rip), %xmm2
 122         movaps  %xmm10, %xmm8
 123         mulps   %xmm15, %xmm2
 124         pxor    %xmm12, %xmm8
 125         movups  sPC7+__svml_satan2_data_internal(%rip), %xmm3
 126         xorl    %edx, %edx
 127         mulps   %xmm15, %xmm3
 128         addps   sPC6+__svml_satan2_data_internal(%rip), %xmm2
 129         mulps   %xmm15, %xmm2
 130         addps   sPC5+__svml_satan2_data_internal(%rip), %xmm3
 131         mulps   %xmm15, %xmm3
 132         addps   sPC4+__svml_satan2_data_internal(%rip), %xmm2
 133         mulps   %xmm15, %xmm2
 134         addps   sPC3+__svml_satan2_data_internal(%rip), %xmm3
 135         mulps   %xmm15, %xmm3
 136         addps   sPC2+__svml_satan2_data_internal(%rip), %xmm2
 137         mulps   %xmm2, %xmm15
 138         addps   sPC1+__svml_satan2_data_internal(%rip), %xmm3
 139         mulps   %xmm3, %xmm14
 140         addps   sPC0+__svml_satan2_data_internal(%rip), %xmm15
 141
 142         /* if x<0, sPI = Pi, else sPI =0 */
 143         movups  __svml_satan2_data_internal(%rip), %xmm5
 144         xorl    %eax, %eax
 145         andnps  sPIO2+__svml_satan2_data_internal(%rip), %xmm7
 146         addps   %xmm14, %xmm15
 147         cmpleps %xmm5, %xmm4
 148
 149         /* Reconstruction. */
 150         mulps   %xmm15, %xmm0
 151         andps   sPI+__svml_satan2_data_internal(%rip), %xmm4
 152         addps   %xmm7, %xmm0
 153         orps    %xmm9, %xmm0
 154         addps   %xmm4, %xmm0
 155         orps    %xmm8, %xmm0
 156
 157         /*  Special branch for fast (vector) processing of zero arguments  */
 158         testl   %ecx, %ecx
 159
 160         /* Go to auxilary branch */
 161         jne     L(AUX_BRANCH)
 162         # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13
 163
 164         /* Return from auxilary branch
 165          * for out of main path inputs
 166          */
 167
 168 L(AUX_BRANCH_RETURN):
 169         /*
 170          *  Special branch for fast (vector) processing of zero arguments
 171          *  The end of implementation
 172          */
 173         testl   %edx, %edx
 174
 175         /* Go to special inputs processing branch */
 176         jne     L(SPECIAL_VALUES_BRANCH)
 177         # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13
 178
 179         /* Restore registers
 180          * and exit the function
 181          */
 182
 183 L(EXIT):
 184         addq    $88, %rsp
 185         cfi_def_cfa_offset(8)
 186         ret
 187         cfi_def_cfa_offset(96)
 188
 189         /* Branch to process
 190          * special inputs
 191          */
 192
 193 L(SPECIAL_VALUES_BRANCH):
 194         movups  %xmm12, 32(%rsp)
 195         movups  %xmm13, 48(%rsp)
 196         movups  %xmm0, 64(%rsp)
 197         # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
 198
 199         movq    %r12, 16(%rsp)
 200         cfi_offset(12, -80)
 201         movl    %eax, %r12d
 202         movq    %r13, 8(%rsp)
 203         cfi_offset(13, -88)
 204         movl    %edx, %r13d
 205         movq    %r14, (%rsp)
 206         cfi_offset(14, -96)
 207         # LOE rbx rbp r15 r12d r13d
 208
 209         /* Range mask
 210          * bits check
 211          */
 212
 213 L(RANGEMASK_CHECK):
 214         btl     %r12d, %r13d
 215
 216         /* Call scalar math function */
 217         jc      L(SCALAR_MATH_CALL)
 218         # LOE rbx rbp r15 r12d r13d
 219
 220         /* Special inputs
 221          * processing loop
 222          */
 223
 224 L(SPECIAL_VALUES_LOOP):
 225         incl    %r12d
 226         cmpl    $4, %r12d
 227
 228         /* Check bits in range mask */
 229         jl      L(RANGEMASK_CHECK)
 230         # LOE rbx rbp r15 r12d r13d
 231
 232         movq    16(%rsp), %r12
 233         cfi_restore(12)
 234         movq    8(%rsp), %r13
 235         cfi_restore(13)
 236         movq    (%rsp), %r14
 237         cfi_restore(14)
 238         movups  64(%rsp), %xmm0
 239
 240         /* Go to exit */
 241         jmp     L(EXIT)
 242         cfi_offset(12, -80)
 243         cfi_offset(13, -88)
 244         cfi_offset(14, -96)
 245         # LOE rbx rbp r12 r13 r14 r15 xmm0
 246
 247         /* Scalar math fucntion call
 248          * to process special input
 249          */
 250
 251 L(SCALAR_MATH_CALL):
 252         movl    %r12d, %r14d
 253         movss   32(%rsp, %r14, 4), %xmm0
 254         movss   48(%rsp, %r14, 4), %xmm1
 255         call    atan2f@PLT
 256         # LOE rbx rbp r14 r15 r12d r13d xmm0
 257
 258         movss   %xmm0, 64(%rsp, %r14, 4)
 259
 260         /* Process special inputs in loop */
 261         jmp     L(SPECIAL_VALUES_LOOP)
 262         cfi_restore(12)
 263         cfi_restore(13)
 264         cfi_restore(14)
 265         # LOE rbx rbp r15 r12d r13d
 266
 267         /* Auxilary branch
 268          * for out of main path inputs
 269          */
 270
 271 L(AUX_BRANCH):
 272         /* Check if both X & Y are not NaNs:  iXYnotNAN */
 273         movaps  %xmm13, %xmm3
 274         movaps  %xmm12, %xmm2
 275         cmpordps %xmm13, %xmm3
 276         cmpordps %xmm12, %xmm2
 277
 278         /*
 279          *  Path for zero arguments (at least one of both)
 280          * Check if both args are zeros (den. is zero)
 281          */
 282         cmpeqps %xmm5, %xmm6
 283
 284         /* Check if at least on of Y or Y is zero: iAXAYZERO */
 285         pcmpeqd %xmm5, %xmm11
 286         pcmpeqd %xmm5, %xmm10
 287         andps   %xmm2, %xmm3
 288         por     %xmm10, %xmm11
 289
 290         /* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
 291         andps   %xmm3, %xmm11
 292
 293         /* Exclude from previous callout mask zero (and not NaN) arguments */
 294         movaps  %xmm11, %xmm10
 295         pandn   %xmm1, %xmm10
 296
 297         /* Set sPIO2 to zero if den. is zero */
 298         movaps  %xmm6, %xmm1
 299         andnps  %xmm7, %xmm1
 300         andps   %xmm5, %xmm6
 301         orps    %xmm6, %xmm1
 302
 303         /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
 304         pcmpgtd %xmm13, %xmm5
 305         orps    %xmm9, %xmm1
 306         andps   %xmm4, %xmm5
 307
 308         /* Merge results from main and spec path */
 309         movaps  %xmm11, %xmm4
 310         addps   %xmm5, %xmm1
 311
 312         /* Go to callout */
 313         movmskps %xmm10, %edx
 314         orps    %xmm8, %xmm1
 315         andnps  %xmm0, %xmm4
 316         andps   %xmm11, %xmm1
 317         movaps  %xmm4, %xmm0
 318         orps    %xmm1, %xmm0
 319
 320         /* Return to main vector processing path */
 321         jmp     L(AUX_BRANCH_RETURN)
 322         # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm12 xmm13
 323 END(_ZGVbN4vv_atan2f_sse4)
 324
 325         .section .rodata, "a"
 326         .align  16
 327
 328 #ifdef __svml_satan2_data_internal_typedef
 329 typedef unsigned int VUINT32;
 330 typedef struct {
 331         __declspec(align(16)) VUINT32 sZERO[4][1];
 332         __declspec(align(16)) VUINT32 sSIGN_MASK[4][1];
 333         __declspec(align(16)) VUINT32 sABS_MASK[4][1];
 334         __declspec(align(16)) VUINT32 sPIO2[4][1];
 335         __declspec(align(16)) VUINT32 sPI[4][1];
 336         __declspec(align(16)) VUINT32 sPC8[4][1];
 337         __declspec(align(16)) VUINT32 sPC7[4][1];
 338         __declspec(align(16)) VUINT32 sPC6[4][1];
 339         __declspec(align(16)) VUINT32 sPC5[4][1];
 340         __declspec(align(16)) VUINT32 sPC4[4][1];
 341         __declspec(align(16)) VUINT32 sPC3[4][1];
 342         __declspec(align(16)) VUINT32 sPC2[4][1];
 343         __declspec(align(16)) VUINT32 sPC1[4][1];
 344         __declspec(align(16)) VUINT32 sPC0[4][1];
 345         __declspec(align(16)) VUINT32 iCHK_WORK_SUB[4][1];
 346         __declspec(align(16)) VUINT32 iCHK_WORK_CMP[4][1];
 347 } __svml_satan2_data_internal;
 348 #endif
 349 __svml_satan2_data_internal:
 350         .long   0x00000000, 0x00000000, 0x00000000, 0x00000000 // sZERO
 351         .align  16
 352         .long   0x80000000, 0x80000000, 0x80000000, 0x80000000 // sSIGN_MASK
 353         .align  16
 354         .long   0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // sABS_MASK
 355         .align  16
 356         .long   0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // sPIO2
 357         .align  16
 358         .long   0x40490FDB, 0x40490FDB, 0x40490FDB, 0x40490FDB // sPI
 359         .align  16
 360         .long   0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // sA08
 361         .align  16
 362         .long   0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // sA07
 363         .align  16
 364         .long   0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // sA06
 365         .align  16
 366         .long   0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // sA05
 367         .align  16
 368         .long   0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // sA04
 369         .align  16
 370         .long   0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // sA03
 371         .align  16
 372         .long   0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // sA02
 373         .align  16
 374         .long   0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // sA01
 375         .align  16
 376         .long   0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 // sA00
 377         .align  16
 378         .long   0x81000000, 0x81000000, 0x81000000, 0x81000000 // iCHK_WORK_SUB
 379         .align  16
 380         .long   0xFC000000, 0xFC000000, 0xFC000000, 0xFC000000 // iCHK_WORK_CMP
 381         .align  16
 382         .type   __svml_satan2_data_internal, @object
 383         .size   __svml_satan2_data_internal, .-__svml_satan2_data_internal