sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S

   1 /* Optimized with sse2 version of cosf
   2    Copyright (C) 2012-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20 #define __need_Emath
  21 #include <bits/errno.h>
  22
  23 /* Short algorithm description:
  24  *
  25  *  1) if |x| == 0: return 1.0-|x|.
  26  *  2) if |x| <  2^-27: return 1.0-|x|.
  27  *  3) if |x| <  2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1.
  28  *  4) if |x| <   Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).
  29  *  5) if |x| < 9*Pi/4:
  30  *      5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3,
  31  *           t=|x|-j*Pi/4.
  32  *      5.2) Reconstruction:
  33  *          s = (-1.0)^((n>>2)&1)
  34  *          if(n&2 != 0) {
  35  *              using cos(t) polynomial for |t|<Pi/4, result is
  36  *              s     * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))).
  37  *          } else {
  38  *              using sin(t) polynomial for |t|<Pi/4, result is
  39  *              s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))).
  40  *          }
  41  *  6) if |x| < 2^23, large args:
  42  *      6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
  43  *           t=|x|-j*Pi/4.
  44  *      6.2) Reconstruction same as (5.2).
  45  *  7) if |x| >= 2^23, very large args:
  46  *      7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3,
  47  *           t=|x|-j*Pi/4.
  48  *      7.2) Reconstruction same as (5.2).
  49  *  8) if x is Inf, return x-x, and set errno=EDOM.
  50  *  9) if x is NaN, return x-x.
  51  *
  52  * Special cases:
  53  *  cos(+-0) = 1 not raising inexact,
  54  *  cos(subnormal) raises inexact,
  55  *  cos(min_normalized) raises inexact,
  56  *  cos(normalized) raises inexact,
  57  *  cos(Inf) = NaN, raises invalid, sets errno to EDOM,
  58  *  cos(NaN) = NaN.
  59  */
  60
  61 #ifdef  PIC
     /* Position-independent build: constants are addressed relative to the
        GOT through %ebx (@GOTOFF), so %ebx is saved on entry and restored
        before every return.  LOAD_PIC_REG is defined outside this file;
        presumably it loads the GOT address into %ebx.  */
     /* MO1: memory operand for constant L(symbol).
        MO2: same, indexed by reg2 scaled by _scale.  */
  62 # define MO1(symbol)                    L(symbol)##@GOTOFF(%ebx)
  63 # define MO2(symbol,reg2,_scale)        L(symbol)##@GOTOFF(%ebx,reg2,_scale)
     /* Unwind (CFI) bookkeeping for a 4-byte push/pop of REG.  */
  64 # define CFI_PUSH(REG)  cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0)
  65 # define CFI_POP(REG)   cfi_adjust_cfa_offset(-4); cfi_restore(REG)
  66 # define PUSH(REG)                      pushl REG; CFI_PUSH(REG)
  67 # define POP(REG)                       popl REG; CFI_POP(REG)
  68 # define ENTRANCE                       PUSH(%ebx); LOAD_PIC_REG(bx)
     /* RETURN is expanded in the middle of the function, so after the ret
        the CFI state is put back to "%ebx pushed" for the code that
        follows.  */
  69 # define RETURN                         POP(%ebx); ret; CFI_PUSH(%ebx)
     /* x sits above the return address and the %ebx saved by ENTRANCE.  */
  70 # define ARG_X                          8(%esp)
  71 #else
     /* Non-PIC build: constants are addressed absolutely, nothing is pushed,
        so x sits directly above the return address.  */
  72 # define MO1(symbol)                    L(symbol)
  73 # define MO2(symbol,reg2,_scale)        L(symbol)(,reg2,_scale)
  74 # define ENTRANCE
  75 # define RETURN                         ret
  76 # define ARG_X                          4(%esp)
  77 #endif
  78
  79         .text
  80 ENTRY(__cosf_sse2)
  81         /* Input: single precision x on stack at address ARG_X.
              Output: cos(x) loaded onto the x87 stack by fldl/flds just
              before each RETURN.
              Scratch: %eax, %ecx, %edx, %xmm0-%xmm6.  In PIC builds %ebx is
              saved by ENTRANCE and restored by RETURN.
              All range checks compare the integer bit pattern of |x|.  */
  82
  83         ENTRANCE
  84         movl    ARG_X, %eax             /* Bits of x */
  85         cvtss2sd ARG_X, %xmm0           /* DP x */
  86         andl    $0x7fffffff, %eax       /* |x| */
  87
  88         cmpl    $0x3f490fdb, %eax       /* |x|<Pi/4?  */
  89         jb      L(arg_less_pio4)
  90
  91         /* Here if |x|>=Pi/4 */
  92         movd    %eax, %xmm3             /* SP |x| */
  93         andpd   MO1(DP_ABS_MASK),%xmm0  /* DP |x| */
  94         movss   MO1(SP_INVPIO4), %xmm2  /* SP 1/(Pi/4) */
  95
  96         cmpl    $0x40e231d6, %eax       /* |x|<9*Pi/4?  */
  97         jae     L(large_args)
  98
  99         /* Here if Pi/4<=|x|<9*Pi/4 */
 100         mulss   %xmm3, %xmm2            /* SP |x|/(Pi/4) */
 101         cvttss2si %xmm2, %eax           /* k, number of Pi/4 in x */
 102         addl    $1, %eax                /* k+1 */
 103         movl    $0x0e, %edx
 104         andl    %eax, %edx              /* j = (k+1)&0x0e, always even */
 105         addl    $2, %eax                /* n = k+3 */
 106         subsd   MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */
 107
 108 L(reconstruction):
 109         /* Input: %eax=n, %xmm0=t */
 110         testl   $2, %eax                /* n&2 != 0?  */
 111         jz      L(sin_poly)
 112
 113 /*L(cos_poly):*/
 114         /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4:
 115          * y = t*t; z = y*y;
 116          * s = (-1.0)^((n>>2)&1)
 117          * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))))
              * The even (C0,C2,C4) and odd (C1,C3) coefficient chains are
              * evaluated separately in z and then summed.
 118          */
 119         shrl    $2, %eax                /* n>>2 */
 120         mulsd   %xmm0, %xmm0            /* y=t^2 */
 121         andl    $1, %eax                /* (n>>2)&1 */
 122         movaps  %xmm0, %xmm1            /* y */
 123         mulsd   %xmm0, %xmm0            /* z=t^4 */
 124
 125         movsd   MO1(DP_C4), %xmm4       /* C4 */
 126         mulsd   %xmm0, %xmm4            /* z*C4 */
 127         movsd   MO1(DP_C3), %xmm3       /* C3 */
 128         mulsd   %xmm0, %xmm3            /* z*C3 */
 129         addsd   MO1(DP_C2), %xmm4       /* C2+z*C4 */
 130         mulsd   %xmm0, %xmm4            /* z*(C2+z*C4) */
 131         lea     -8(%esp), %esp          /* Borrow 8 bytes of stack frame */
 132         addsd   MO1(DP_C1), %xmm3       /* C1+z*C3 */
 133         mulsd   %xmm0, %xmm3            /* z*(C1+z*C3) */
 134         addsd   MO1(DP_C0), %xmm4       /* C0+z*(C2+z*C4) */
 135         mulsd   %xmm1, %xmm4            /* y*(C0+z*(C2+z*C4)) */
 136
 137         addsd   %xmm4, %xmm3            /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
 138         /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
 139         addsd   MO1(DP_ONES), %xmm3
 140
 141         mulsd   MO2(DP_ONES,%eax,8), %xmm3 /* DP result, times s=DP_ONES[(n>>2)&1] */
 142         movsd   %xmm3, 0(%esp)          /* Move result from sse...  */
 143         fldl    0(%esp)                 /* ...to FPU.  */
 144         /* Return back 8 bytes of stack frame */
 145         lea     8(%esp), %esp
 146         RETURN
 147
 148         .p2align        4
 149 L(sin_poly):
 150         /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4:
 151          * y = t*t; z = y*y;
 152          * s = (-1.0)^((n>>2)&1)
 153          * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))))
 154          */
 155
 156         movaps  %xmm0, %xmm4            /* t */
 157         shrl    $2, %eax                /* n>>2 */
 158         mulsd   %xmm0, %xmm0            /* y=t^2 */
 159         andl    $1, %eax                /* (n>>2)&1 */
 160         movaps  %xmm0, %xmm1            /* y */
 161         mulsd   %xmm0, %xmm0            /* z=t^4 */
 162
 163         movsd   MO1(DP_S4), %xmm2       /* S4 */
 164         mulsd   %xmm0, %xmm2            /* z*S4 */
 165         movsd   MO1(DP_S3), %xmm3       /* S3 */
 166         mulsd   %xmm0, %xmm3            /* z*S3 */
 167         lea     -8(%esp), %esp          /* Borrow 8 bytes of stack frame */
 168         addsd   MO1(DP_S2), %xmm2       /* S2+z*S4 */
 169         mulsd   %xmm0, %xmm2            /* z*(S2+z*S4) */
 170         addsd   MO1(DP_S1), %xmm3       /* S1+z*S3 */
 171         mulsd   %xmm0, %xmm3            /* z*(S1+z*S3) */
 172         addsd   MO1(DP_S0), %xmm2       /* S0+z*(S2+z*S4) */
 173         mulsd   %xmm1, %xmm2            /* y*(S0+z*(S2+z*S4)) */
 174         /* t*s, where s = (-1.0)^((n>>2)&1) */
 175         mulsd   MO2(DP_ONES,%eax,8), %xmm4
 176         addsd   %xmm2, %xmm3            /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
 177         /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */
 178         mulsd   %xmm4, %xmm3
 179         /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))) */
 180         addsd   %xmm4, %xmm3
 181         movsd   %xmm3, 0(%esp)          /* Move result from sse...   */
 182         fldl    0(%esp)                 /* ...to FPU.  */
 183         /* Return back 8 bytes of stack frame */
 184         lea     8(%esp), %esp
 185         RETURN
 186
 187         .p2align        4
 188 L(large_args):
 189         /* Here if |x|>=9*Pi/4 */
 190         cmpl    $0x7f800000, %eax       /* x is Inf or NaN?  */
             /* The flags of this compare are still live at L(arg_inf_or_nan),
                which uses ZF to tell Inf from NaN.  */
 191         jae     L(arg_inf_or_nan)
 192
 193         /* Here if finite |x|>=9*Pi/4 */
 194         cmpl    $0x4b000000, %eax       /* |x|<2^23?  */
 195         jae     L(very_large_args)
 196
 197         /* Here if 9*Pi/4<=|x|<2^23 */
 198         movsd   MO1(DP_INVPIO4), %xmm1  /* 1/(Pi/4) */
 199         mulsd   %xmm0, %xmm1            /* |x|/(Pi/4) */
 200         cvttsd2si %xmm1, %eax           /* k=trunc(|x|/(Pi/4)) */
 201         addl    $1, %eax                /* k+1 */
 202         movl    %eax, %edx
 203         andl    $0xfffffffe, %edx       /* j=(k+1)&0xfffffffe */
 204         cvtsi2sdl %edx, %xmm4           /* DP j */
 205         movsd   MO1(DP_PIO4HI), %xmm2   /* -PIO4HI = high part of -Pi/4 */
 206         mulsd   %xmm4, %xmm2            /* -j*PIO4HI */
 207         movsd   MO1(DP_PIO4LO), %xmm3   /* -PIO4LO = low part of -Pi/4 */
 208         addsd   %xmm2, %xmm0            /* |x| - j*PIO4HI */
 209         addl    $2, %eax                /* n = k+3 */
 210         mulsd   %xmm3, %xmm4            /* -j*PIO4LO */
 211         addsd   %xmm4, %xmm0            /* t = |x| - j*PIO4HI - j*PIO4LO */
 212         jmp     L(reconstruction)
 213
 214         .p2align        4
 215 L(very_large_args):
 216         /* Here if finite |x|>=2^23 */
 217
 218         /* bitpos = (ix>>23) - BIAS_32 + 59; */
 219         shrl    $23, %eax               /* eb = biased exponent of x */
 220         /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */
 221         subl    $68, %eax
 222         movl    $28, %ecx               /* %cl=28 */
 223         movl    %eax, %edx              /* bitpos copy */
 224
 225         /* j = bitpos/28; */
 226         div     %cl                     /* j in register %al=%ax/%cl */
 227         movapd  %xmm0, %xmm3            /* |x| */
 228         /* clear unneeded remainder from %ah */
 229         andl    $0xff, %eax
 230
 231         imull   $28, %eax, %ecx         /* j*28 */
 232         movsd   MO1(DP_HI_MASK), %xmm4  /* DP_HI_MASK */
 233         movapd  %xmm0, %xmm5            /* |x| */
 234         mulsd   -2*8+MO2(_FPI,%eax,8), %xmm3    /* tmp3 = FPI[j-2]*|x| */
 235         movapd  %xmm0, %xmm1            /* |x| */
 236         mulsd   -1*8+MO2(_FPI,%eax,8), %xmm5    /* tmp2 = FPI[j-1]*|x| */
 237         mulsd   0*8+MO2(_FPI,%eax,8), %xmm0     /* tmp0 = FPI[j]*|x| */
 238         addl    $19, %ecx               /* j*28+19 */
 239         mulsd   1*8+MO2(_FPI,%eax,8), %xmm1     /* tmp1 = FPI[j+1]*|x| */
 240         cmpl    %ecx, %edx              /* bitpos>=j*28+19?  */
 241         jl      L(very_large_skip1)
 242
 243         /* Here if bitpos>=j*28+19 */
 244         andpd   %xmm3, %xmm4            /* HI(tmp3) */
 245         subsd   %xmm4, %xmm3            /* tmp3 = tmp3 - HI(tmp3) */
 246 L(very_large_skip1):
 247
 248         movsd   MO1(DP_2POW52), %xmm6   /* +2^52 */
 249         movapd  %xmm5, %xmm2            /* tmp2 copy */
 250         addsd   %xmm3, %xmm5            /* tmp5 = tmp3 + tmp2 */
 251         movl    $1, %edx
 252         addsd   %xmm5, %xmm6            /* tmp6 = tmp5 + 2^52 */
 253         movsd   8+MO1(DP_2POW52), %xmm4 /* -2^52 */
 254         movd    %xmm6, %eax             /* k = I64_LO(tmp6); */
 255         addsd   %xmm6, %xmm4            /* tmp4 = tmp6 - 2^52 */
 256         comisd  %xmm5, %xmm4            /* tmp4 > tmp5?  */
 257         jbe     L(very_large_skip2)
 258
 259         /* Here if tmp4 > tmp5 */
 260         subl    $1, %eax                /* k-- */
 261         addsd   8+MO1(DP_ONES), %xmm4   /* tmp4 -= 1.0 */
 262 L(very_large_skip2):
 263
 264         andl    %eax, %edx              /* k&1 */
 265         subsd   %xmm4, %xmm3            /* tmp3 -= tmp4 */
 266         addsd   MO2(DP_ZERONE,%edx,8), %xmm3 /* t  = DP_ZERONE[k&1] + tmp3 */
 267         addsd   %xmm2, %xmm3            /* t += tmp2 */
 268         addsd   %xmm3, %xmm0            /* t += tmp0 */
 269         addl    $3, %eax                /* n=k+3 */
 270         addsd   %xmm1, %xmm0            /* t += tmp1 */
 271         mulsd   MO1(DP_PIO4), %xmm0     /* t *= PIO4 */
 272
 273         jmp     L(reconstruction)       /* end of very_large_args path */
 274
 275         .p2align        4
 276 L(arg_less_pio4):
 277         /* Here if |x|<Pi/4 */
 278         cmpl    $0x3d000000, %eax       /* |x|<2^-5?  */
             /* Signed jl is safe here: the sign bit of %eax was cleared.  */
 279         jl      L(arg_less_2pn5)
 280
 281         /* Here if 2^-5<=|x|<Pi/4 */
 282         mulsd   %xmm0, %xmm0            /* y=x^2 */
 283         movaps  %xmm0, %xmm1            /* y */
 284         mulsd   %xmm0, %xmm0            /* z=x^4 */
 285         movsd   MO1(DP_C4), %xmm3       /* C4 */
 286         mulsd   %xmm0, %xmm3            /* z*C4 */
 287         movsd   MO1(DP_C3), %xmm5       /* C3 */
 288         mulsd   %xmm0, %xmm5            /* z*C3 */
 289         addsd   MO1(DP_C2), %xmm3       /* C2+z*C4 */
 290         mulsd   %xmm0, %xmm3            /* z*(C2+z*C4) */
 291         addsd   MO1(DP_C1), %xmm5       /* C1+z*C3 */
 292         mulsd   %xmm0, %xmm5            /* z*(C1+z*C3) */
 293         addsd   MO1(DP_C0), %xmm3       /* C0+z*(C2+z*C4) */
 294         mulsd   %xmm1, %xmm3            /* y*(C0+z*(C2+z*C4)) */
 295         addsd   %xmm5, %xmm3            /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
 296         /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */
 297         addsd   MO1(DP_ONES), %xmm3
 298         cvtsd2ss %xmm3, %xmm3           /* SP result */
 299
 300 L(epilogue):
             /* Input: %xmm3 = single precision result.  */
 301         lea     -4(%esp), %esp          /* Borrow 4 bytes of stack frame */
 302         movss   %xmm3, 0(%esp)          /* Move result from sse...  */
 303         flds    0(%esp)                 /* ...to FPU.  */
 304         /* Return back 4 bytes of stack frame */
 305         lea     4(%esp), %esp
 306         RETURN
 307
 308         .p2align        4
 309 L(arg_less_2pn5):
 310         /* Here if |x|<2^-5 */
 311         cmpl    $0x32000000, %eax       /* |x|<2^-27?  */
 312         jl      L(arg_less_2pn27)
 313
 314         /* Here if 2^-27<=|x|<2^-5 */
 315         mulsd   %xmm0, %xmm0            /* DP x^2 */
 316         movsd   MO1(DP_COS2_1), %xmm3   /* DP DP_COS2_1 */
 317         mulsd   %xmm0, %xmm3            /* DP x^2*DP_COS2_1 */
 318         addsd   MO1(DP_COS2_0), %xmm3   /* DP DP_COS2_0+x^2*DP_COS2_1 */
 319         mulsd   %xmm0, %xmm3            /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */
 320         /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */
 321         addsd   MO1(DP_ONES), %xmm3
 322         cvtsd2ss %xmm3, %xmm3           /* SP result */
 323         jmp     L(epilogue)
 324
 325         .p2align        4
 326 L(arg_less_2pn27):
 327         /* Here if |x|<2^-27 */
 328         movss   ARG_X, %xmm0            /* x */
 329         andps   MO1(SP_ABS_MASK),%xmm0  /* |x| */
 330         movss   MO1(SP_ONE), %xmm3      /* 1.0 */
 331         subss   %xmm0, %xmm3            /* result is 1.0-|x| */
 332         jmp     L(epilogue)
 333
 334         .p2align        4
 335 L(arg_inf_or_nan):
 336         /* Here if |x| is Inf or NAN.  The flags still come from
                cmpl $0x7f800000, %eax in L(large_args): equal means Inf,
                above means NaN.  */
 337         jne     L(skip_errno_setting)   /* in case of x is NaN */
 338
 339         /* Here if x is Inf. Set errno to EDOM.  */
 340         call    JUMPTARGET(__errno_location)
 341         movl    $EDOM, (%eax)           /* store through the returned pointer */
 342
 343         .p2align        4
 344 L(skip_errno_setting):
 345         /* Here if |x| is Inf or NAN. Continued.  */
 346         movss   ARG_X, %xmm3            /* load x */
 347         subss   %xmm3, %xmm3            /* x-x: result is NaN */
 348         jmp     L(epilogue)
 349 END(__cosf_sse2)
 350
 351         .section .rodata, "a"
 352         .p2align 3
 353 L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */
             /* Each entry is one double written as two .long words, low word
                first.  The Pi/4<=|x|<9*Pi/4 path reads entry j=(k+1)&0x0e
                with an 8-byte scale, so only even j are fetched there.  */
 354         .long   0x00000000,0x00000000
 355         .long   0x54442d18,0x3fe921fb
 356         .long   0x54442d18,0x3ff921fb
 357         .long   0x7f3321d2,0x4002d97c
 358         .long   0x54442d18,0x400921fb
 359         .long   0x2955385e,0x400f6a7a
 360         .long   0x7f3321d2,0x4012d97c
 361         .long   0xe9bba775,0x4015fdbb
 362         .long   0x54442d18,0x401921fb
 363         .long   0xbeccb2bb,0x401c463a
 364         .long   0x2955385e,0x401f6a7a
 365         .type L(PIO4J), @object
 366         ASM_SIZE_DIRECTIVE(L(PIO4J))
 367
 368         .p2align 3
 369 L(_FPI): /* 4/Pi broken into sum of positive DP values */
             /* L(very_large_args) multiplies |x| by the four consecutive
                entries FPI[j-2], FPI[j-1], FPI[j], FPI[j+1], where
                j = bitpos/28.  The first entry is 0.0.  */
 370         .long   0x00000000,0x00000000
 371         .long   0x6c000000,0x3ff45f30
 372         .long   0x2a000000,0x3e3c9c88
 373         .long   0xa8000000,0x3c54fe13
 374         .long   0xd0000000,0x3aaf47d4
 375         .long   0x6c000000,0x38fbb81b
 376         .long   0xe0000000,0x3714acc9
 377         .long   0x7c000000,0x3560e410
 378         .long   0x56000000,0x33bca2c7
 379         .long   0xac000000,0x31fbd778
 380         .long   0xe0000000,0x300b7246
 381         .long   0xe8000000,0x2e5d2126
 382         .long   0x48000000,0x2c970032
 383         .long   0xe8000000,0x2ad77504
 384         .long   0xe0000000,0x290921cf
 385         .long   0xb0000000,0x274deb1c
 386         .long   0xe0000000,0x25829a73
 387         .long   0xbe000000,0x23fd1046
 388         .long   0x10000000,0x2224baed
 389         .long   0x8e000000,0x20709d33
 390         .long   0x80000000,0x1e535a2f
 391         .long   0x64000000,0x1cef904e
 392         .long   0x30000000,0x1b0d6398
 393         .long   0x24000000,0x1964ce7d
 394         .long   0x16000000,0x17b908bf
 395         .type L(_FPI), @object
 396         ASM_SIZE_DIRECTIVE(L(_FPI))
 397
 398 /* Coefficients of polynomial
 399  for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5.  */
 400         .p2align 3
 401 L(DP_COS2_0):
 402         .long   0xff5cc6fd,0xbfdfffff
 403         .type L(DP_COS2_0), @object
 404         ASM_SIZE_DIRECTIVE(L(DP_COS2_0))
 405
 406         .p2align 3
 407 L(DP_COS2_1):
 408         .long   0xb178dac5,0x3fa55514
 409         .type L(DP_COS2_1), @object
 410         ASM_SIZE_DIRECTIVE(L(DP_COS2_1))
 411
     /* Indexed by k&1 in L(very_large_args) and added to the running sum:
        adds 0.0 when k is even, -1.0 when k is odd.  */
 412         .p2align 3
 413 L(DP_ZERONE):
 414         .long   0x00000000,0x00000000   /* 0.0 */
 415         .long   0x00000000,0xbff00000   /* -1.0 */
 416         .type L(DP_ZERONE),@object
 417         ASM_SIZE_DIRECTIVE(L(DP_ZERONE))
 418
     /* Entry 0 is the constant 1.0 of the polynomials; indexed by (n>>2)&1
        the pair supplies the sign factor s in the reconstruction step.  The
        second entry is also added directly (8+DP_ONES) to subtract 1.0.  */
 419         .p2align 3
 420 L(DP_ONES):
 421         .long   0x00000000,0x3ff00000   /* +1.0 */
 422         .long   0x00000000,0xbff00000   /* -1.0 */
 423         .type L(DP_ONES), @object
 424         ASM_SIZE_DIRECTIVE(L(DP_ONES))
 425
 426 /* Coefficients of polynomial
 427  for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.
      Each coefficient is one double stored as two .long words, low word
      first.  The objects are not laid out in index order (S3, S1, S4, S2,
      S0).  */
 428         .p2align 3
 429 L(DP_S3):
 430         .long   0x64e6b5b4,0x3ec71d72
 431         .type L(DP_S3), @object
 432         ASM_SIZE_DIRECTIVE(L(DP_S3))
 433
 434         .p2align 3
 435 L(DP_S1):
 436         .long   0x10c2688b,0x3f811111
 437         .type L(DP_S1), @object
 438         ASM_SIZE_DIRECTIVE(L(DP_S1))
 439
 440         .p2align 3
 441 L(DP_S4):
 442         .long   0x1674b58a,0xbe5a947e
 443         .type L(DP_S4), @object
 444         ASM_SIZE_DIRECTIVE(L(DP_S4))
 445
 446         .p2align 3
 447 L(DP_S2):
 448         .long   0x8b4bd1f9,0xbf2a019f
 449         .type L(DP_S2), @object
 450         ASM_SIZE_DIRECTIVE(L(DP_S2))
 451
 452         .p2align 3
 453 L(DP_S0):
 454         .long   0x55551cd9,0xbfc55555
 455         .type L(DP_S0), @object
 456         ASM_SIZE_DIRECTIVE(L(DP_S0))
 457
 458 /* Coefficients of polynomial
 459  for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.
      Used both by the reconstruction step and by the 2^-5<=|x|<Pi/4 path.
      Stored in the order C3, C1, C4, C2, C0.  */
 460         .p2align 3
 461 L(DP_C3):
 462         .long   0x9ac43cc0,0x3efa00eb
 463         .type L(DP_C3), @object
 464         ASM_SIZE_DIRECTIVE(L(DP_C3))
 465
 466         .p2align 3
 467 L(DP_C1):
 468         .long   0x545c50c7,0x3fa55555
 469         .type L(DP_C1), @object
 470         ASM_SIZE_DIRECTIVE(L(DP_C1))
 471
 472         .p2align 3
 473 L(DP_C4):
 474         .long   0xdd8844d7,0xbe923c97
 475         .type L(DP_C4), @object
 476         ASM_SIZE_DIRECTIVE(L(DP_C4))
 477
 478         .p2align 3
 479 L(DP_C2):
 480         .long   0x348b6874,0xbf56c16b
 481         .type L(DP_C2), @object
 482         ASM_SIZE_DIRECTIVE(L(DP_C2))
 483
 484         .p2align 3
 485 L(DP_C0):
 486         .long   0xfffe98ae,0xbfdfffff
 487         .type L(DP_C0), @object
 488         ASM_SIZE_DIRECTIVE(L(DP_C0))
 489
 490         .p2align 3
 491 L(DP_PIO4):
 492         .long   0x54442d18,0x3fe921fb   /* Pi/4 */
 493         .type L(DP_PIO4), @object
 494         ASM_SIZE_DIRECTIVE(L(DP_PIO4))
 495
     /* Pair +2^52 / -2^52: L(very_large_args) adds the first and then the
        second (at offset 8) to split off the integer part of a value.  */
 496         .p2align 3
 497 L(DP_2POW52):
 498         .long   0x00000000,0x43300000   /* +2^52 */
 499         .long   0x00000000,0xc3300000   /* -2^52 */
 500         .type L(DP_2POW52), @object
 501         ASM_SIZE_DIRECTIVE(L(DP_2POW52))
 502
 503         .p2align 3
 504 L(DP_INVPIO4):
 505         .long   0x6dc9c883,0x3ff45f30   /* 4/Pi */
 506         .type L(DP_INVPIO4), @object
 507         ASM_SIZE_DIRECTIVE(L(DP_INVPIO4))
 508
     /* DP_PIO4HI and DP_PIO4LO are stored negated (sign bit set), so the
        9*Pi/4<=|x|<2^23 path can form t = |x| - j*Pi/4 with additions.  */
 509         .p2align 3
 510 L(DP_PIO4HI):
 511         .long   0x54000000,0xbfe921fb   /* High part of -Pi/4 */
 512         .type L(DP_PIO4HI), @object
 513         ASM_SIZE_DIRECTIVE(L(DP_PIO4HI))
 514
 515         .p2align 3
 516 L(DP_PIO4LO):
 517         .long   0x11A62633,0xbe010b46   /* Low part of -Pi/4 */
 518         .type L(DP_PIO4LO), @object
 519         ASM_SIZE_DIRECTIVE(L(DP_PIO4LO))
 520
 521         .p2align 2
 522 L(SP_INVPIO4):
 523         .long   0x3fa2f983              /* 4/Pi */
 524         .type L(SP_INVPIO4), @object
 525         ASM_SIZE_DIRECTIVE(L(SP_INVPIO4))
 526
     /* 16 bytes, 16-byte aligned: used as the memory operand of andpd.  */
 527         .p2align 4
 528 L(DP_ABS_MASK): /* Mask for getting DP absolute value */
 529         .long   0xffffffff,0x7fffffff
 530         .long   0xffffffff,0x7fffffff
 531         .type L(DP_ABS_MASK), @object
 532         ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK))
 533
     /* Keeps the upper 32 bits of a double (sign, exponent and the top 20
        fraction bits) and clears the low 32 bits.  Loaded with movsd, so
        only 8 bytes are needed.  */
 534         .p2align 3
 535 L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */
 536         .long   0x00000000,0xffffffff
 537         .type L(DP_HI_MASK), @object
 538         ASM_SIZE_DIRECTIVE(L(DP_HI_MASK))
 539
     /* 16 bytes, 16-byte aligned: used as the memory operand of andps.  */
 540         .p2align 4
 541 L(SP_ABS_MASK): /* Mask for getting SP absolute value */
 542         .long   0x7fffffff,0x7fffffff
 543         .long   0x7fffffff,0x7fffffff
 544         .type L(SP_ABS_MASK), @object
 545         ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK))
 546
 547         .p2align 2
 548 L(SP_ONE):
 549         .long   0x3f800000              /* 1.0 */
 550         .type L(SP_ONE), @object
 551         ASM_SIZE_DIRECTIVE(L(SP_ONE))
 552
     /* NOTE(review): the function defined above is __cosf_sse2; __cosf is
        not defined in this file and presumably comes from the multiarch
        selector -- confirm before touching this alias.  */
 553 weak_alias (__cosf, cosf)