sysdeps/i386/fpu/s_cbrtf.S

   1 /* Compute cubic root of float value.
   2    Copyright (C) 1997, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
   5    Ulrich Drepper <drepper@cygnus.com>, 1997.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 #include <machine/asm.h>
  23
  24 #ifdef __ELF__
  25         .section .rodata
  26 #else
  27         .text
  28 #endif
  29
  30         .align ALIGNARG(4)
  31         ASM_TYPE_DIRECTIVE(f3,@object)
  32 f3:     .double 0.191502161678719066
  33         ASM_SIZE_DIRECTIVE(f3)
  34         ASM_TYPE_DIRECTIVE(f2,@object)
  35 f2:     .double 0.697570460207922770
  36         ASM_SIZE_DIRECTIVE(f2)
  37         ASM_TYPE_DIRECTIVE(f1,@object)
  38 f1:     .double 0.492659620528969547
  39         ASM_SIZE_DIRECTIVE(f1)
  40
  41 #define CBRT2           1.2599210498948731648
  42 #define ONE_CBRT2       0.793700525984099737355196796584
  43 #define SQR_CBRT2       1.5874010519681994748
  44 #define ONE_SQR_CBRT2   0.629960524947436582364439673883
  45
  46         ASM_TYPE_DIRECTIVE(factor,@object)
  47         .align ALIGNARG(4)
  48 factor: .double ONE_SQR_CBRT2
  49         .double ONE_CBRT2
  50         .double 1.0
  51         .double CBRT2
  52         .double SQR_CBRT2
  53         ASM_SIZE_DIRECTIVE(factor)
  54
  55         ASM_TYPE_DIRECTIVE(two25,@object)
  56 two25:  .byte 0, 0, 0, 0x4c
  57         ASM_SIZE_DIRECTIVE(two25)
  58
  59 #ifdef PIC
  60 #define MO(op) op##@GOTOFF(%ebx)
  61 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
  62 #else
  63 #define MO(op) op
  64 #define MOX(op,x) op(x)
  65 #endif
  66
  67         .text
  68 ENTRY(__cbrtf)
  69         movl    4(%esp), %eax
  70         xorl    %ecx, %ecx
  71         movl    %eax, %edx
  72         andl    $0x7fffffff, %eax
  73         jz      1f
  74         cmpl    $0x7f800000, %eax
  75         jae     1f
  76
  77 #ifdef PIC
  78         pushl   %ebx
  79         cfi_adjust_cfa_offset (4)
  80         cfi_rel_offset (ebx, 0)
  81         LOAD_PIC_REG (bx)
  82 #endif
  83
  84         cmpl    $0x00800000, %eax
  85         jae     2f
  86
  87 #ifdef PIC
  88         flds    8(%esp)
  89 #else
  90         flds    4(%esp)
  91 #endif
  92         fmuls   MO(two25)
  93         movl    $-25, %ecx
  94 #ifdef PIC
  95         fstps   8(%esp)
  96         movl    8(%esp), %eax
  97 #else
  98         fstps   4(%esp)
  99         movl    4(%esp), %eax
 100 #endif
 101         movl    %eax, %edx
 102         andl    $0x7fffffff, %eax
 103
 104 2:      shrl    $23, %eax
 105         andl    $0x807fffff, %edx
 106         subl    $126, %eax
 107         orl     $0x3f000000, %edx
 108         addl    %eax, %ecx
 109 #ifdef PIC
 110         movl    %edx, 8(%esp)
 111
 112         flds    8(%esp)                 /* xm */
 113 #else
 114         movl    %edx, 4(%esp)
 115
 116         flds    4(%esp)                 /* xm */
 117 #endif
 118         fabs
 119
 120         /* The following code has two tracks:
 121             a) compute the normalized cbrt value
 122             b) compute xe/3 and xe%3
 123            The right track computes the value for b) and this is done
 124            in an optimized way by avoiding division.
 125
 126            But why two tracks at all?  Very easy: efficiency.  Some FP
 127            instruction can overlap with a certain amount of integer (and
 128            FP) instructions.  So we get (except for the imull) all
 129            instructions for free.  */
 130
 131         fld     %st(0)                  /* xm : xm */
 132         fmull   MO(f3)                  /* f3*xm : xm */
 133                         movl    $1431655766, %eax
 134         fsubrl  MO(f2)                  /* f2-f3*xm : xm */
 135                         imull   %ecx
 136         fmul    %st(1)                  /* (f2-f3*xm)*xm : xm */
 137                         movl    %ecx, %eax
 138         faddl   MO(f1)                  /* u:=f1+(f2-f3*xm)*xm : xm */
 139                         sarl    $31, %eax
 140         fld     %st                     /* u : u : xm */
 141                         subl    %eax, %edx
 142         fmul    %st(1)                  /* u*u : u : xm */
 143         fld     %st(2)                  /* xm : u*u : u : xm */
 144         fadd    %st                     /* 2*xm : u*u : u : xm */
 145         fxch    %st(1)                  /* u*u : 2*xm : u : xm */
 146         fmul    %st(2)                  /* t2:=u*u*u : 2*xm : u : xm */
 147                         movl    %edx, %eax
 148         fadd    %st, %st(1)             /* t2 : t2+2*xm : u : xm */
 149                         leal    (%edx,%edx,2),%edx
 150         fadd    %st(0)                  /* 2*t2 : t2+2*xm : u : xm */
 151                         subl    %edx, %ecx
 152         faddp   %st, %st(3)             /* t2+2*xm : u : 2*t2+xm */
 153                         shll    $3, %ecx
 154         fmulp                           /* u*(t2+2*xm) : 2*t2+xm */
 155         fdivp   %st, %st(1)             /* u*(t2+2*xm)/(2*t2+xm) */
 156         fmull   MOX(16+factor,%ecx)     /* u*(t2+2*xm)/(2*t2+xm)*FACT */
 157         pushl   %eax
 158         cfi_adjust_cfa_offset (4)
 159         fildl   (%esp)                  /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 160         fxch                            /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
 161         fscale                          /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
 162         popl    %edx
 163         cfi_adjust_cfa_offset (-4)
 164 #ifdef PIC
 165         movl    8(%esp), %eax
 166         popl    %ebx
 167         cfi_adjust_cfa_offset (-4)
 168         cfi_restore (ebx)
 169 #else
 170         movl    4(%esp), %eax
 171 #endif
 172         testl   %eax, %eax
 173         fstp    %st(1)
 174         jns     4f
 175         fchs
 176 4:      ret
 177
 178         /* Return the argument.  */
 179 1:      flds    4(%esp)
 180         ret
 181 END(__cbrtf)
 182 weak_alias (__cbrtf, cbrtf)