sysdeps/i386/fpu/s_cbrtl.S

   1 /* Compute cubic root of long double value.
   2    Copyright (C) 1997, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Dirk Alboth <dirka@uni-paderborn.de> and
   5    Ulrich Drepper <drepper@cygnus.com>, 1997.
   6
   7    The GNU C Library is free software; you can redistribute it and/or
   8    modify it under the terms of the GNU Lesser General Public
   9    License as published by the Free Software Foundation; either
  10    version 2.1 of the License, or (at your option) any later version.
  11
  12    The GNU C Library is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    Lesser General Public License for more details.
  16
  17    You should have received a copy of the GNU Lesser General Public
  18    License along with the GNU C Library; if not, write to the Free
  19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  20    02111-1307 USA.  */
  21
  22 #include <machine/asm.h>
  23
  24 #ifdef __ELF__
  25         .section .rodata
  26 #else
  27         .text
  28 #endif
  29
  30         .align ALIGNARG(4)
  31         ASM_TYPE_DIRECTIVE(f8,@object)
  32 f8:     .tfloat 0.161617097923756032
  33         ASM_SIZE_DIRECTIVE(f8)
  34         .align ALIGNARG(4)
  35         ASM_TYPE_DIRECTIVE(f7,@object)
  36 f7:     .tfloat -0.988553671195413709
  37         ASM_SIZE_DIRECTIVE(f7)
  38         .align ALIGNARG(4)
  39         ASM_TYPE_DIRECTIVE(f6,@object)
  40 f6:     .tfloat 2.65298938441952296
  41         ASM_SIZE_DIRECTIVE(f6)
  42         .align ALIGNARG(4)
  43         ASM_TYPE_DIRECTIVE(f5,@object)
  44 f5:     .tfloat -4.11151425200350531
  45         ASM_SIZE_DIRECTIVE(f5)
  46         .align ALIGNARG(4)
  47         ASM_TYPE_DIRECTIVE(f4,@object)
  48 f4:     .tfloat 4.09559907378707839
  49         ASM_SIZE_DIRECTIVE(f4)
  50         .align ALIGNARG(4)
  51         ASM_TYPE_DIRECTIVE(f3,@object)
  52 f3:     .tfloat -2.82414939754975962
  53         ASM_SIZE_DIRECTIVE(f3)
  54         .align ALIGNARG(4)
  55         ASM_TYPE_DIRECTIVE(f2,@object)
  56 f2:     .tfloat 1.67595307700780102
  57         ASM_SIZE_DIRECTIVE(f2)
  58         .align ALIGNARG(4)
  59         ASM_TYPE_DIRECTIVE(f1,@object)
  60 f1:     .tfloat 0.338058687610520237
  61         ASM_SIZE_DIRECTIVE(f1)
  62
  63 #define CBRT2           1.2599210498948731648
  64 #define ONE_CBRT2       0.793700525984099737355196796584
  65 #define SQR_CBRT2       1.5874010519681994748
  66 #define ONE_SQR_CBRT2   0.629960524947436582364439673883
  67
  68         /* We make the entries in the following table all 16 bytes
  69            wide to avoid having to implement a multiplication by 10.  */
  70         ASM_TYPE_DIRECTIVE(factor,@object)
  71         .align ALIGNARG(4)
  72 factor: .tfloat ONE_SQR_CBRT2
  73         .byte 0, 0, 0, 0, 0, 0
  74         .tfloat ONE_CBRT2
  75         .byte 0, 0, 0, 0, 0, 0
  76         .tfloat 1.0
  77         .byte 0, 0, 0, 0, 0, 0
  78         .tfloat CBRT2
  79         .byte 0, 0, 0, 0, 0, 0
  80         .tfloat SQR_CBRT2
  81         ASM_SIZE_DIRECTIVE(factor)
  82
  83         ASM_TYPE_DIRECTIVE(two64,@object)
  84         .align ALIGNARG(4)
  85 two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
  86         ASM_SIZE_DIRECTIVE(two64)
  87
  88 #ifdef PIC
  89 #define MO(op) op##@GOTOFF(%ebx)
  90 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
  91 #else
  92 #define MO(op) op
  93 #define MOX(op,x) op(x)
  94 #endif
  95
  96         .text
  97 ENTRY(__cbrtl)
  98         movl    4(%esp), %ecx
  99         movl    12(%esp), %eax
 100         orl     8(%esp), %ecx
 101         movl    %eax, %edx
 102         andl    $0x7fff, %eax
 103         orl     %eax, %ecx
 104         jz      1f
 105         xorl    %ecx, %ecx
 106         cmpl    $0x7fff, %eax
 107         je      1f
 108
 109 #ifdef PIC
 110         pushl   %ebx
 111         cfi_adjust_cfa_offset (4)
 112         cfi_rel_offset (ebx, 0)
 113         LOAD_PIC_REG (bx)
 114 #endif
 115
 116         cmpl    $0, %eax
 117         jne     2f
 118
 119 #ifdef PIC
 120         fldt    8(%esp)
 121 #else
 122         fldt    4(%esp)
 123 #endif
 124         fmull   MO(two64)
 125         movl    $-64, %ecx
 126 #ifdef PIC
 127         fstpt   8(%esp)
 128         movl    16(%esp), %eax
 129 #else
 130         fstpt   4(%esp)
 131         movl    12(%esp), %eax
 132 #endif
 133         movl    %eax, %edx
 134         andl    $0x7fff, %eax
 135
 136 2:      andl    $0x8000, %edx
 137         subl    $16382, %eax
 138         orl     $0x3ffe, %edx
 139         addl    %eax, %ecx
 140 #ifdef PIC
 141         movl    %edx, 16(%esp)
 142
 143         fldt    8(%esp)                 /* xm */
 144 #else
 145         movl    %edx, 12(%esp)
 146
 147         fldt    4(%esp)                 /* xm */
 148 #endif
 149         fabs
 150
 151         /* The following code has two tracks:
 152             a) compute the normalized cbrt value
 153             b) compute xe/3 and xe%3
 154            The right track computes the value for b) and this is done
 155            in an optimized way by avoiding division.
 156
 157            But why two tracks at all?  Very easy: efficiency.  Some FP
 158            instruction can overlap with a certain amount of integer (and
 159            FP) instructions.  So we get (except for the imull) all
 160            instructions for free.  */
 161
 162         fldt    MO(f8)                  /* f8 : xm */
 163         fmul    %st(1)                  /* f8*xm : xm */
 164
 165         fldt    MO(f7)
 166         faddp                           /* f7+f8*xm : xm */
 167         fmul    %st(1)                  /* (f7+f8*xm)*xm : xm */
 168                         movl    $1431655766, %eax
 169         fldt    MO(f6)
 170         faddp                           /* f6+(f7+f8*xm)*xm : xm */
 171                         imull   %ecx
 172         fmul    %st(1)                  /* (f6+(f7+f8*xm)*xm)*xm : xm */
 173                         movl    %ecx, %eax
 174         fldt    MO(f5)
 175         faddp                           /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
 176                         sarl    $31, %eax
 177         fmul    %st(1)                  /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 178                         subl    %eax, %edx
 179         fldt    MO(f4)
 180         faddp                           /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 181         fmul    %st(1)                  /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 182         fldt    MO(f3)
 183         faddp                           /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 184         fmul    %st(1)                  /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 185         fldt    MO(f2)
 186         faddp                           /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 187         fmul    %st(1)                  /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 188         fldt    MO(f1)
 189         faddp                           /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 190
 191         fld     %st                     /* u : u : xm */
 192         fmul    %st(1)                  /* u*u : u : xm */
 193         fld     %st(2)                  /* xm : u*u : u : xm */
 194         fadd    %st                     /* 2*xm : u*u : u : xm */
 195         fxch    %st(1)                  /* u*u : 2*xm : u : xm */
 196         fmul    %st(2)                  /* t2:=u*u*u : 2*xm : u : xm */
 197                         movl    %edx, %eax
 198         fadd    %st, %st(1)             /* t2 : t2+2*xm : u : xm */
 199                         leal    (%edx,%edx,2),%edx
 200         fadd    %st(0)                  /* 2*t2 : t2+2*xm : u : xm */
 201                         subl    %edx, %ecx
 202         faddp   %st, %st(3)             /* t2+2*xm : u : 2*t2+xm */
 203                         shll    $4, %ecx
 204         fmulp                           /* u*(t2+2*xm) : 2*t2+xm */
 205         fdivp   %st, %st(1)             /* u*(t2+2*xm)/(2*t2+xm) */
 206         fldt    MOX(32+factor,%ecx)
 207         fmulp                           /* u*(t2+2*xm)/(2*t2+xm)*FACT */
 208         pushl   %eax
 209         cfi_adjust_cfa_offset (4)
 210         fildl   (%esp)                  /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 211         fxch                            /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
 212         fscale                          /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
 213         popl    %edx
 214         cfi_adjust_cfa_offset (-4)
 215 #ifdef PIC
 216         movl    16(%esp), %eax
 217         popl    %ebx
 218         cfi_adjust_cfa_offset (-4)
 219         cfi_restore (ebx)
 220 #else
 221         movl    12(%esp), %eax
 222 #endif
 223         testl   $0x8000, %eax
 224         fstp    %st(1)
 225         jz      4f
 226         fchs
 227 4:      ret
 228
 229         /* Return the argument.  */
 230 1:      fldt    4(%esp)
 231         ret
 232 END(__cbrtl)
 233 weak_alias (__cbrtl, cbrtl)