sysdeps/i386/fpu/s_cbrtl.S

   1 /* Compute cubic root of long double value.
   2    Copyright (C) 1997-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <libm-alias-ldouble.h>
  20 #include <machine/asm.h>
  21
  22         .section .rodata
  23
  24         .align ALIGNARG(4)
  25         .type f8,@object
  26 f8:     .quad   0xa57ef3d83a542839  /* 0.161617097923756032  */
  27         .short  0x3ffc
  28         ASM_SIZE_DIRECTIVE(f8)
  29         .align ALIGNARG(4)
  30         .type f7,@object
  31 f7:     .quad   0xfd11da7820029014  /* -0.988553671195413709  */
  32         .short  0xbffe
  33         ASM_SIZE_DIRECTIVE(f7)
  34         .align ALIGNARG(4)
  35         .type f6,@object
  36 f6:     .quad   0xa9ca93fcade3b4ad  /* 2.65298938441952296  */
  37         .short  0x4000
  38         ASM_SIZE_DIRECTIVE(f6)
  39         .align ALIGNARG(4)
  40         .type f5,@object
  41 f5:     .quad   0x839186562c931c34  /* -4.11151425200350531  */
  42         .short  0xc001
  43         ASM_SIZE_DIRECTIVE(f5)
  44         .align ALIGNARG(4)
  45         .type f4,@object
  46 f4:     .quad   0x830f25c9ee304594  /* 4.09559907378707839  */
  47         .short  0x4001
  48         ASM_SIZE_DIRECTIVE(f4)
  49         .align ALIGNARG(4)
  50         .type f3,@object
  51 f3:     .quad   0xb4bedd1d5fa2f0c6  /* -2.82414939754975962  */
  52         .short  0xc000
  53         ASM_SIZE_DIRECTIVE(f3)
  54         .align ALIGNARG(4)
  55         .type f2,@object
  56 f2:     .quad   0xd685a163b08586e3  /* 1.67595307700780102  */
  57         .short  0x3fff
  58         ASM_SIZE_DIRECTIVE(f2)
  59         .align ALIGNARG(4)
  60         .type f1,@object
  61 f1:     .quad   0xad16073ed4ec3b45  /* 0.338058687610520237  */
  62         .short  0x3ffd
  63         ASM_SIZE_DIRECTIVE(f1)
  64
  65         /* We make the entries in the following table all 16 bytes
  66            wide to avoid having to implement a multiplication by 10.  */
  67         .type factor,@object
  68         .align ALIGNARG(4)
  69 factor: /* 1.0 / cbrt (2.0) ^ 2 = 0.629960524947436582364439673883  */
  70         .quad 0xa14517cc6b945711
  71         .short 0x3ffe
  72         .byte 0, 0, 0, 0, 0, 0
  73         /* 1.0 / cbrt (2.0) = 0.793700525984099737355196796584  */
  74         .quad 0xcb2ff529eb71e415
  75         .short 0x3ffe
  76         .byte 0, 0, 0, 0, 0, 0
  77         /* 1.0L  */
  78         .quad 0x8000000000000000
  79         .short 0x3fff
  80         .byte 0, 0, 0, 0, 0, 0
  81         /* cbrt (2.0) = 1.2599210498948731648  */
  82         .quad 0xa14517cc6b945711
  83         .short 0x3fff
  84         .byte 0, 0, 0, 0, 0, 0
  85         /* cbrt (2.0) ^ 2 = 1.5874010519681994748  */
  86         .quad 0xcb2ff529eb71e416
  87         .short 0x3fff
  88         ASM_SIZE_DIRECTIVE(factor)
  89
  90         .type two64,@object
  91         .align ALIGNARG(4)
  92 two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
  93         ASM_SIZE_DIRECTIVE(two64)
  94
  95 #ifdef PIC
  96 #define MO(op) op##@GOTOFF(%ebx)
  97 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
  98 #else
  99 #define MO(op) op
 100 #define MOX(op,x) op(x)
 101 #endif
 102
 103         .text
 104 ENTRY(__cbrtl)
 105         movl    4(%esp), %ecx
 106         movl    12(%esp), %eax
 107         orl     8(%esp), %ecx
 108         movl    %eax, %edx
 109         andl    $0x7fff, %eax
 110         orl     %eax, %ecx
 111         jz      1f
 112         xorl    %ecx, %ecx
 113         cmpl    $0x7fff, %eax
 114         je      1f
 115
 116 #ifdef PIC
 117         pushl   %ebx
 118         cfi_adjust_cfa_offset (4)
 119         cfi_rel_offset (ebx, 0)
 120         LOAD_PIC_REG (bx)
 121 #endif
 122
 123         cmpl    $0, %eax
 124         jne     2f
 125
 126 #ifdef PIC
 127         fldt    8(%esp)
 128 #else
 129         fldt    4(%esp)
 130 #endif
 131         fmull   MO(two64)
 132         movl    $-64, %ecx
 133 #ifdef PIC
 134         fstpt   8(%esp)
 135         movl    16(%esp), %eax
 136 #else
 137         fstpt   4(%esp)
 138         movl    12(%esp), %eax
 139 #endif
 140         movl    %eax, %edx
 141         andl    $0x7fff, %eax
 142
 143 2:      andl    $0x8000, %edx
 144         subl    $16382, %eax
 145         orl     $0x3ffe, %edx
 146         addl    %eax, %ecx
 147 #ifdef PIC
 148         movl    %edx, 16(%esp)
 149
 150         fldt    8(%esp)                 /* xm */
 151 #else
 152         movl    %edx, 12(%esp)
 153
 154         fldt    4(%esp)                 /* xm */
 155 #endif
 156         fabs
 157
 158         /* The following code has two tracks:
 159             a) compute the normalized cbrt value
 160             b) compute xe/3 and xe%3
 161            The right track computes the value for b) and this is done
 162            in an optimized way by avoiding division.
 163
 164            But why two tracks at all?  Very easy: efficiency.  Some FP
 165            instruction can overlap with a certain amount of integer (and
 166            FP) instructions.  So we get (except for the imull) all
 167            instructions for free.  */
 168
 169         fldt    MO(f8)                  /* f8 : xm */
 170         fmul    %st(1)                  /* f8*xm : xm */
 171
 172         fldt    MO(f7)
 173         faddp                           /* f7+f8*xm : xm */
 174         fmul    %st(1)                  /* (f7+f8*xm)*xm : xm */
 175                         movl    $1431655766, %eax
 176         fldt    MO(f6)
 177         faddp                           /* f6+(f7+f8*xm)*xm : xm */
 178                         imull   %ecx
 179         fmul    %st(1)                  /* (f6+(f7+f8*xm)*xm)*xm : xm */
 180                         movl    %ecx, %eax
 181         fldt    MO(f5)
 182         faddp                           /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
 183                         sarl    $31, %eax
 184         fmul    %st(1)                  /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 185                         subl    %eax, %edx
 186         fldt    MO(f4)
 187         faddp                           /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 188         fmul    %st(1)                  /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 189         fldt    MO(f3)
 190         faddp                           /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 191         fmul    %st(1)                  /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 192         fldt    MO(f2)
 193         faddp                           /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 194         fmul    %st(1)                  /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 195         fldt    MO(f1)
 196         faddp                           /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 197
 198         fld     %st                     /* u : u : xm */
 199         fmul    %st(1)                  /* u*u : u : xm */
 200         fld     %st(2)                  /* xm : u*u : u : xm */
 201         fadd    %st                     /* 2*xm : u*u : u : xm */
 202         fxch    %st(1)                  /* u*u : 2*xm : u : xm */
 203         fmul    %st(2)                  /* t2:=u*u*u : 2*xm : u : xm */
 204                         movl    %edx, %eax
 205         fadd    %st, %st(1)             /* t2 : t2+2*xm : u : xm */
 206                         leal    (%edx,%edx,2),%edx
 207         fadd    %st(0)                  /* 2*t2 : t2+2*xm : u : xm */
 208                         subl    %edx, %ecx
 209         faddp   %st, %st(3)             /* t2+2*xm : u : 2*t2+xm */
 210                         shll    $4, %ecx
 211         fmulp                           /* u*(t2+2*xm) : 2*t2+xm */
 212         fdivp   %st, %st(1)             /* u*(t2+2*xm)/(2*t2+xm) */
 213         fldt    MOX(32+factor,%ecx)
 214         fmulp                           /* u*(t2+2*xm)/(2*t2+xm)*FACT */
 215         pushl   %eax
 216         cfi_adjust_cfa_offset (4)
 217         fildl   (%esp)                  /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 218         fxch                            /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
 219         fscale                          /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
 220         popl    %edx
 221         cfi_adjust_cfa_offset (-4)
 222 #ifdef PIC
 223         movl    16(%esp), %eax
 224         popl    %ebx
 225         cfi_adjust_cfa_offset (-4)
 226         cfi_restore (ebx)
 227 #else
 228         movl    12(%esp), %eax
 229 #endif
 230         testl   $0x8000, %eax
 231         fstp    %st(1)
 232         jz      4f
 233         fchs
 234 4:      ret
 235
 236         /* Return the argument.  */
 237 1:      fldt    4(%esp)
 238         fadd    %st
 239         ret
 240 END(__cbrtl)
 241 libm_alias_ldouble (__cbrt, cbrt)