sysdeps/i386/fpu/s_cbrtl.S

   1 /* Compute cubic root of long double value.
   2    Copyright (C) 1997-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
   5    Ulrich Drepper <drepper@cygnus.com>, 1997.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include <machine/asm.h>
  22
  23         .section .rodata
  24
  25         .align ALIGNARG(4)
  26         .type f8,@object
  27 f8:     .tfloat 0.161617097923756032
  28         ASM_SIZE_DIRECTIVE(f8)
  29         .align ALIGNARG(4)
  30         .type f7,@object
  31 f7:     .tfloat -0.988553671195413709
  32         ASM_SIZE_DIRECTIVE(f7)
  33         .align ALIGNARG(4)
  34         .type f6,@object
  35 f6:     .tfloat 2.65298938441952296
  36         ASM_SIZE_DIRECTIVE(f6)
  37         .align ALIGNARG(4)
  38         .type f5,@object
  39 f5:     .tfloat -4.11151425200350531
  40         ASM_SIZE_DIRECTIVE(f5)
  41         .align ALIGNARG(4)
  42         .type f4,@object
  43 f4:     .tfloat 4.09559907378707839
  44         ASM_SIZE_DIRECTIVE(f4)
  45         .align ALIGNARG(4)
  46         .type f3,@object
  47 f3:     .tfloat -2.82414939754975962
  48         ASM_SIZE_DIRECTIVE(f3)
  49         .align ALIGNARG(4)
  50         .type f2,@object
  51 f2:     .tfloat 1.67595307700780102
  52         ASM_SIZE_DIRECTIVE(f2)
  53         .align ALIGNARG(4)
  54         .type f1,@object
  55 f1:     .tfloat 0.338058687610520237
  56         ASM_SIZE_DIRECTIVE(f1)
  57
  58 #define CBRT2           1.2599210498948731648
  59 #define ONE_CBRT2       0.793700525984099737355196796584
  60 #define SQR_CBRT2       1.5874010519681994748
  61 #define ONE_SQR_CBRT2   0.629960524947436582364439673883
  62
  63         /* We make the entries in the following table all 16 bytes
  64            wide to avoid having to implement a multiplication by 10.  */
  65         .type factor,@object
  66         .align ALIGNARG(4)
  67 factor: .tfloat ONE_SQR_CBRT2
  68         .byte 0, 0, 0, 0, 0, 0
  69         .tfloat ONE_CBRT2
  70         .byte 0, 0, 0, 0, 0, 0
  71         .tfloat 1.0
  72         .byte 0, 0, 0, 0, 0, 0
  73         .tfloat CBRT2
  74         .byte 0, 0, 0, 0, 0, 0
  75         .tfloat SQR_CBRT2
  76         ASM_SIZE_DIRECTIVE(factor)
  77
  78         .type two64,@object
  79         .align ALIGNARG(4)
  80 two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
  81         ASM_SIZE_DIRECTIVE(two64)
  82
  83 #ifdef PIC
  84 #define MO(op) op##@GOTOFF(%ebx)
  85 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
  86 #else
  87 #define MO(op) op
  88 #define MOX(op,x) op(x)
  89 #endif
  90
  91         .text
  92 ENTRY(__cbrtl)
  93         movl    4(%esp), %ecx
  94         movl    12(%esp), %eax
  95         orl     8(%esp), %ecx
  96         movl    %eax, %edx
  97         andl    $0x7fff, %eax
  98         orl     %eax, %ecx
  99         jz      1f
 100         xorl    %ecx, %ecx
 101         cmpl    $0x7fff, %eax
 102         je      1f
 103
 104 #ifdef PIC
 105         pushl   %ebx
 106         cfi_adjust_cfa_offset (4)
 107         cfi_rel_offset (ebx, 0)
 108         LOAD_PIC_REG (bx)
 109 #endif
 110
 111         cmpl    $0, %eax
 112         jne     2f
 113
 114 #ifdef PIC
 115         fldt    8(%esp)
 116 #else
 117         fldt    4(%esp)
 118 #endif
 119         fmull   MO(two64)
 120         movl    $-64, %ecx
 121 #ifdef PIC
 122         fstpt   8(%esp)
 123         movl    16(%esp), %eax
 124 #else
 125         fstpt   4(%esp)
 126         movl    12(%esp), %eax
 127 #endif
 128         movl    %eax, %edx
 129         andl    $0x7fff, %eax
 130
 131 2:      andl    $0x8000, %edx
 132         subl    $16382, %eax
 133         orl     $0x3ffe, %edx
 134         addl    %eax, %ecx
 135 #ifdef PIC
 136         movl    %edx, 16(%esp)
 137
 138         fldt    8(%esp)                 /* xm */
 139 #else
 140         movl    %edx, 12(%esp)
 141
 142         fldt    4(%esp)                 /* xm */
 143 #endif
 144         fabs
 145
 146         /* The following code has two tracks:
 147             a) compute the normalized cbrt value
 148             b) compute xe/3 and xe%3
 149            The right track computes the value for b) and this is done
 150            in an optimized way by avoiding division.
 151
 152            But why two tracks at all?  Very easy: efficiency.  Some FP
 153            instruction can overlap with a certain amount of integer (and
 154            FP) instructions.  So we get (except for the imull) all
 155            instructions for free.  */
 156
 157         fldt    MO(f8)                  /* f8 : xm */
 158         fmul    %st(1)                  /* f8*xm : xm */
 159
 160         fldt    MO(f7)
 161         faddp                           /* f7+f8*xm : xm */
 162         fmul    %st(1)                  /* (f7+f8*xm)*xm : xm */
 163                         movl    $1431655766, %eax
 164         fldt    MO(f6)
 165         faddp                           /* f6+(f7+f8*xm)*xm : xm */
 166                         imull   %ecx
 167         fmul    %st(1)                  /* (f6+(f7+f8*xm)*xm)*xm : xm */
 168                         movl    %ecx, %eax
 169         fldt    MO(f5)
 170         faddp                           /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
 171                         sarl    $31, %eax
 172         fmul    %st(1)                  /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 173                         subl    %eax, %edx
 174         fldt    MO(f4)
 175         faddp                           /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 176         fmul    %st(1)                  /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 177         fldt    MO(f3)
 178         faddp                           /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 179         fmul    %st(1)                  /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 180         fldt    MO(f2)
 181         faddp                           /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 182         fmul    %st(1)                  /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 183         fldt    MO(f1)
 184         faddp                           /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 185
 186         fld     %st                     /* u : u : xm */
 187         fmul    %st(1)                  /* u*u : u : xm */
 188         fld     %st(2)                  /* xm : u*u : u : xm */
 189         fadd    %st                     /* 2*xm : u*u : u : xm */
 190         fxch    %st(1)                  /* u*u : 2*xm : u : xm */
 191         fmul    %st(2)                  /* t2:=u*u*u : 2*xm : u : xm */
 192                         movl    %edx, %eax
 193         fadd    %st, %st(1)             /* t2 : t2+2*xm : u : xm */
 194                         leal    (%edx,%edx,2),%edx
 195         fadd    %st(0)                  /* 2*t2 : t2+2*xm : u : xm */
 196                         subl    %edx, %ecx
 197         faddp   %st, %st(3)             /* t2+2*xm : u : 2*t2+xm */
 198                         shll    $4, %ecx
 199         fmulp                           /* u*(t2+2*xm) : 2*t2+xm */
 200         fdivp   %st, %st(1)             /* u*(t2+2*xm)/(2*t2+xm) */
 201         fldt    MOX(32+factor,%ecx)
 202         fmulp                           /* u*(t2+2*xm)/(2*t2+xm)*FACT */
 203         pushl   %eax
 204         cfi_adjust_cfa_offset (4)
 205         fildl   (%esp)                  /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 206         fxch                            /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
 207         fscale                          /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
 208         popl    %edx
 209         cfi_adjust_cfa_offset (-4)
 210 #ifdef PIC
 211         movl    16(%esp), %eax
 212         popl    %ebx
 213         cfi_adjust_cfa_offset (-4)
 214         cfi_restore (ebx)
 215 #else
 216         movl    12(%esp), %eax
 217 #endif
 218         testl   $0x8000, %eax
 219         fstp    %st(1)
 220         jz      4f
 221         fchs
 222 4:      ret
 223
 224         /* Return the argument.  */
 225 1:      fldt    4(%esp)
 226         ret
 227 END(__cbrtl)
 228 weak_alias (__cbrtl, cbrtl)