4 // Copyright (c) 2000 - 2003, Intel Corporation
5 // All rights reserved.
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //*********************************************************************
42 // 09/04/01 Initial version
43 // 09/13/01 Performance improved, symmetry problems fixed
44 // 10/10/01 Performance improved, split issues removed
45 // 12/11/01 Changed huges_logp to not be global
46 // 05/20/02 Cleaned up namespace and sf0 syntax
47 // 02/10/03 Reordered header: .section, .global, .proc, .align;
48 // used data8 for long double table values
50 //*********************************************************************
53 //==============================================================
54 // long double asinhl(long double);
56 // Overview of operation
57 //==============================================================
60 // 1. x = 0, [S,Q]Nan or +/-INF
61 // Return asinhl(x) = x + x;
64 // Return asinhl(x) = x - x^2;
67 // Return asinhl(x) = x + x^2;
69 // 4. 'Near 0': max denormal < |x| < 1/128
70 // Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
72 // 5. 'Huges': |x| > 2^63
73 // Return asinhl(x) = sign(x)*(logl(2*x));
75 // 6. 'Main path': 1/128 < |x| < 2^63
76 // b_hi + b_lo = x + sqrt(x^2 + 1);
77 // asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
79 // Algorithm description
80 //==============================================================
82 // Main path algorithm
83 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
84 // *************************************************************************
86 // There are 3 parts of x+sqrt(x^2+1) computation:
88 // 1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
89 // ------------------------------------
90 // p2_hi = x2_hi + 1, where x2_hi = x * x;
91 // p2_lo = x2_lo + p1_lo, where
92 // x2_lo = FMS(x*x-x2_hi),
93 // p1_lo = (1 - p2_hi) + x2_hi;
95 // 2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
96 // ----------------------------------------------
97 // r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
98 // g = p2_hi * r (first 8 bit-approximation of sqrt);
102 // g = g * e + g (second 16 bit-approximation of sqrt);
106 // g = g * e + g (third 32 bit-approximation of sqrt);
110 // g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
112 // Remainder computation:
114 // d = (p2_hi - g_hi * g_hi) + p2_lo;
117 // 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
118 // -------------------------------------------------------------------
119 // b_hi = (g_hi + x) + gl;
120 // b_lo = (g_hi - b_hi) + x + gl;
122 // Now we pass b presented as sum b_hi + b_lo to special version
123 // of logl function which accept a pair of arguments as
124 // 'mutiprecision' value.
126 // Special log algorithm overview
127 // ================================
128 // Here we use a table lookup method. The basic idea is that in
129 // order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
130 // we construct a value G such that G*Arg is close to 1 and that
131 // logl(1/G) is obtainable easily from a table of values calculated
134 // logl(Arg) = logl(1/G) + logl((G*Arg - 1))
136 // Because |G*Arg - 1| is small, the second term on the right hand
137 // side can be approximated by a short polynomial. We elaborate
138 // this method in four steps.
140 // Step 0: Initialization
142 // We need to calculate logl( X ). Obtain N, S_hi such that
144 // X = 2^N * ( S_hi + S_lo ) exactly
146 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
147 // that |S_lo| <= ulp(S_hi).
149 // For the special version of logl: S_lo = b_lo
150 // !-----------------------------------------------!
152 // Step 1: Argument Reduction
154 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
156 // G := G_1 * G_2 * G_3
157 // r := (G * S_hi - 1) + G * S_lo
159 // These G_j's have the property that the product is exactly
160 // representable and that |r| < 2^(-12) as a result.
162 // Step 2: Approximation
164 // logl(1 + r) is approximated by a short polynomial poly(r).
166 // Step 3: Reconstruction
170 // logl( X ) = logl( 2^N * (S_hi + S_lo) )
171 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
172 // ~=~ N*logl(2) + logl(1/G) + poly(r).
174 // For detailed description see logl or log1pl function, regular path.
177 //==============================================================
178 // Floating Point registers used:
180 // f32 -> f101 (70 registers)
182 // General registers used:
183 // r32 -> r57 (26 registers)
185 // Predicate registers used:
187 // p6 for '0, NaNs, Inf' path
188 // p7 for '+ denormals' path
189 // p8 for 'near 0' path
190 // p9 for 'huges' path
191 // p10 for '- denormals' path
192 // p11 for negative values
195 //==============================================================
200 // C7, C9 'near 0' polynomial coefficients
201 LOCAL_OBJECT_START(Poly_C_near_0_79)
202 data8 0xF8DC939BBEDD5A54, 0x00003FF9
203 data8 0xB6DB6DAB21565AC5, 0x0000BFFA
204 LOCAL_OBJECT_END(Poly_C_near_0_79)
206 // C3, C5 'near 0' polynomial coefficients
207 LOCAL_OBJECT_START(Poly_C_near_0_35)
208 data8 0x999999999991D582, 0x00003FFB
209 data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
210 LOCAL_OBJECT_END(Poly_C_near_0_35)
213 LOCAL_OBJECT_START(Constants_Q)
214 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
215 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
216 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
217 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
218 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
219 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
220 LOCAL_OBJECT_END(Constants_Q)
223 LOCAL_OBJECT_START(Constants_Z_1)
240 LOCAL_OBJECT_END(Constants_Z_1)
242 // G1 and H1 - IEEE single and h1 - IEEE double
243 LOCAL_OBJECT_START(Constants_G_H_h1)
244 data4 0x3F800000,0x00000000
245 data8 0x0000000000000000
246 data4 0x3F70F0F0,0x3D785196
247 data8 0x3DA163A6617D741C
248 data4 0x3F638E38,0x3DF13843
249 data8 0x3E2C55E6CBD3D5BB
250 data4 0x3F579430,0x3E2FF9A0
251 data8 0xBE3EB0BFD86EA5E7
252 data4 0x3F4CCCC8,0x3E647FD6
253 data8 0x3E2E6A8C86B12760
254 data4 0x3F430C30,0x3E8B3AE7
255 data8 0x3E47574C5C0739BA
256 data4 0x3F3A2E88,0x3EA30C68
257 data8 0x3E20E30F13E8AF2F
258 data4 0x3F321640,0x3EB9CEC8
259 data8 0xBE42885BF2C630BD
260 data4 0x3F2AAAA8,0x3ECF9927
261 data8 0x3E497F3497E577C6
262 data4 0x3F23D708,0x3EE47FC5
263 data8 0x3E3E6A6EA6B0A5AB
264 data4 0x3F1D89D8,0x3EF8947D
265 data8 0xBDF43E3CD328D9BE
266 data4 0x3F17B420,0x3F05F3A1
267 data8 0x3E4094C30ADB090A
268 data4 0x3F124920,0x3F0F4303
269 data8 0xBE28FBB2FC1FE510
270 data4 0x3F0D3DC8,0x3F183EBF
271 data8 0x3E3A789510FDE3FA
272 data4 0x3F088888,0x3F20EC80
273 data8 0x3E508CE57CC8C98F
274 data4 0x3F042108,0x3F29516A
275 data8 0xBE534874A223106C
276 LOCAL_OBJECT_END(Constants_G_H_h1)
279 LOCAL_OBJECT_START(Constants_Z_2)
296 LOCAL_OBJECT_END(Constants_Z_2)
298 // G2 and H2 - IEEE single and h2 - IEEE double
299 LOCAL_OBJECT_START(Constants_G_H_h2)
300 data4 0x3F800000,0x00000000
301 data8 0x0000000000000000
302 data4 0x3F7F00F8,0x3B7F875D
303 data8 0x3DB5A11622C42273
304 data4 0x3F7E03F8,0x3BFF015B
305 data8 0x3DE620CF21F86ED3
306 data4 0x3F7D08E0,0x3C3EE393
307 data8 0xBDAFA07E484F34ED
308 data4 0x3F7C0FC0,0x3C7E0586
309 data8 0xBDFE07F03860BCF6
310 data4 0x3F7B1880,0x3C9E75D2
311 data8 0x3DEA370FA78093D6
312 data4 0x3F7A2328,0x3CBDC97A
313 data8 0x3DFF579172A753D0
314 data4 0x3F792FB0,0x3CDCFE47
315 data8 0x3DFEBE6CA7EF896B
316 data4 0x3F783E08,0x3CFC15D0
317 data8 0x3E0CF156409ECB43
318 data4 0x3F774E38,0x3D0D874D
319 data8 0xBE0B6F97FFEF71DF
320 data4 0x3F766038,0x3D1CF49B
321 data8 0xBE0804835D59EEE8
322 data4 0x3F757400,0x3D2C531D
323 data8 0x3E1F91E9A9192A74
324 data4 0x3F748988,0x3D3BA322
325 data8 0xBE139A06BF72A8CD
326 data4 0x3F73A0D0,0x3D4AE46F
327 data8 0x3E1D9202F8FBA6CF
328 data4 0x3F72B9D0,0x3D5A1756
329 data8 0xBE1DCCC4BA796223
330 data4 0x3F71D488,0x3D693B9D
331 data8 0xBE049391B6B7C239
332 LOCAL_OBJECT_END(Constants_G_H_h2)
334 // G3 and H3 - IEEE single and h3 - IEEE double
335 LOCAL_OBJECT_START(Constants_G_H_h3)
336 data4 0x3F7FFC00,0x38800100
337 data8 0x3D355595562224CD
338 data4 0x3F7FF400,0x39400480
339 data8 0x3D8200A206136FF6
340 data4 0x3F7FEC00,0x39A00640
341 data8 0x3DA4D68DE8DE9AF0
342 data4 0x3F7FE400,0x39E00C41
343 data8 0xBD8B4291B10238DC
344 data4 0x3F7FDC00,0x3A100A21
345 data8 0xBD89CCB83B1952CA
346 data4 0x3F7FD400,0x3A300F22
347 data8 0xBDB107071DC46826
348 data4 0x3F7FCC08,0x3A4FF51C
349 data8 0x3DB6FCB9F43307DB
350 data4 0x3F7FC408,0x3A6FFC1D
351 data8 0xBD9B7C4762DC7872
352 data4 0x3F7FBC10,0x3A87F20B
353 data8 0xBDC3725E3F89154A
354 data4 0x3F7FB410,0x3A97F68B
355 data8 0xBD93519D62B9D392
356 data4 0x3F7FAC18,0x3AA7EB86
357 data8 0x3DC184410F21BD9D
358 data4 0x3F7FA420,0x3AB7E101
359 data8 0xBDA64B952245E0A6
360 data4 0x3F7F9C20,0x3AC7E701
361 data8 0x3DB4B0ECAABB34B8
362 data4 0x3F7F9428,0x3AD7DD7B
363 data8 0x3D9923376DC40A7E
364 data4 0x3F7F8C30,0x3AE7D474
365 data8 0x3DC6E17B4F2083D3
366 data4 0x3F7F8438,0x3AF7CBED
367 data8 0x3DAE314B811D4394
368 data4 0x3F7F7C40,0x3B03E1F3
369 data8 0xBDD46F21B08F2DB1
370 data4 0x3F7F7448,0x3B0BDE2F
371 data8 0xBDDC30A46D34522B
372 data4 0x3F7F6C50,0x3B13DAAA
373 data8 0x3DCB0070B1F473DB
374 data4 0x3F7F6458,0x3B1BD766
375 data8 0xBDD65DDC6AD282FD
376 data4 0x3F7F5C68,0x3B23CC5C
377 data8 0xBDCDAB83F153761A
378 data4 0x3F7F5470,0x3B2BC997
379 data8 0xBDDADA40341D0F8F
380 data4 0x3F7F4C78,0x3B33C711
381 data8 0x3DCD1BD7EBC394E8
382 data4 0x3F7F4488,0x3B3BBCC6
383 data8 0xBDC3532B52E3E695
384 data4 0x3F7F3C90,0x3B43BAC0
385 data8 0xBDA3961EE846B3DE
386 data4 0x3F7F34A0,0x3B4BB0F4
387 data8 0xBDDADF06785778D4
388 data4 0x3F7F2CA8,0x3B53AF6D
389 data8 0x3DCC3ED1E55CE212
390 data4 0x3F7F24B8,0x3B5BA620
391 data8 0xBDBA31039E382C15
392 data4 0x3F7F1CC8,0x3B639D12
393 data8 0x3D635A0B5C5AF197
394 data4 0x3F7F14D8,0x3B6B9444
395 data8 0xBDDCCB1971D34EFC
396 data4 0x3F7F0CE0,0x3B7393BC
397 data8 0x3DC7450252CD7ADA
398 data4 0x3F7F04F0,0x3B7B8B6D
399 data8 0xBDB68F177D7F2A42
400 LOCAL_OBJECT_END(Constants_G_H_h3)
403 //==============================================================
405 // Floating Point Registers
413 // Special logl registers
465 FR_2_to_minus_N = f76
468 // Huge & Main path prolog registers
486 // Near 0 & Huges path prolog registers
499 // General Purpose Registers
501 // General prolog registers
509 // Near 0 path prolog registers
513 // Special logl registers
542 GLOBAL_LIBM_ENTRY(asinhl)
545 alloc GR_PFS = ar.pfs,0,27,0,0
546 fma.s1 FR_P2 = FR_Arg, FR_Arg, f1 // p2 = x^2 + 1
547 mov GR_Half = 0xfffe // 0.5's exp
550 addl GR_Poly_C_79 = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
551 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
552 addl GR_Poly_C_35 = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
556 getf.exp GR_ArgExp = FR_Arg // get arument's exponent
557 fabs FR_AX = FR_Arg // absolute value of argument
558 mov GR_TwoN7 = 0xfff8 // 2^-7 exp
561 ld8 GR_Poly_C_79 = [GR_Poly_C_79] // get actual coeff table address
562 fma.s0 FR_Two = f1, f1, f1 // construct 2.0
563 mov GR_ExpMask = 0x1ffff // mask for exp
567 ld8 GR_Poly_C_35 = [GR_Poly_C_35] // get actual coeff table address
568 fclass.m p6,p0 = FR_Arg, 0xe7 // if arg NaN inf zero
569 mov GR_TwoP63 = 0x1003e // 2^63 exp
572 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
578 setf.exp FR_Half = GR_Half // construct 0.5
579 fclass.m p7,p0 = FR_Arg, 0x09 // if arg + denorm
580 and GR_ArgExp = GR_ExpMask, GR_ArgExp // select exp
583 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
588 ldfe FR_C9 = [GR_Poly_C_79],16 // load C9
589 fclass.m p10,p0 = FR_Arg, 0x0a // if arg - denorm
590 cmp.gt p8, p0 = GR_TwoN7, GR_ArgExp // if arg < 2^-7 ('near 0')
593 cmp.le p9, p0 = GR_TwoP63, GR_ArgExp // if arg > 2^63 ('huges')
594 (p6) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a
595 (p6) br.ret.spnt b0 // return
597 // (X^2 + 1) computation
599 (p8) ldfe FR_C5 = [GR_Poly_C_35],16 // load C5
600 fms.s1 FR_Tmp = f1, f1, FR_P2 // Tmp = 1 - p2
601 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
604 (p8) ldfe FR_C7 = [GR_Poly_C_79],16 // load C7
605 (p7) fnma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a - a*a
606 (p7) br.ret.spnt b0 // return
610 (p8) ldfe FR_C3 = [GR_Poly_C_35],16 // load C3
611 fcmp.lt.s1 p11, p12 = FR_Arg, f0 // if arg is negative
612 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
615 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
616 (p10) fma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a + a*a
617 (p10) br.ret.spnt b0 // return
621 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
622 frsqrta.s1 FR_Rcp, p0 = FR_P2 // Rcp = 1/p2 reciprocal appr.
623 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
627 fms.s1 FR_P2L = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
628 mov GR_Bias = 0x0FFFF // Create exponent bias
633 (p9) fms.s1 FR_XLog_Hi = FR_Two, FR_AX, f0 // Hi of log1p arg = 2*X - 1
634 (p9) br.cond.spnt huges_logl // special version of log1p
638 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
639 (p8) fma.s1 FR_X3 = FR_X2, FR_Arg, f0 // x^3 = x^2 * x
640 (p8) br.cond.spnt near_0 // Go to near 0 branch
644 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
650 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
651 fma.s1 FR_Tmp = FR_Tmp, f1, FR_X2 // Tmp = Tmp + x^2
652 mov GR_exp_mask = 0x1FFFF // Create exponent mask
656 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
657 fma.s1 FR_GG = FR_Rcp, FR_P2, f0 // g = Rcp * p2
658 // 8 bit Newton Raphson iteration
663 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
667 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
668 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
673 fma.s1 FR_P2L = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
678 ldfe FR_Q1 = [GR_ad_q] // Load Q1
679 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
680 // 16 bit Newton Raphson iteration
685 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
691 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
697 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
698 // 32 bit Newton Raphson iteration
703 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
709 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
715 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
716 // 64 bit Newton Raphson iteration
721 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
727 fnma.s1 FR_DD = FR_GG, FR_GG, FR_P2 // Remainder d = g * g - p2
732 fma.s1 FR_XLog_Hi = FR_AX, f1, FR_GG // bh = z + gh
738 fma.s1 FR_DD = FR_DD, f1, FR_P2L // add p2l: d = d + p2l
744 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
745 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
746 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
751 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
752 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
756 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
761 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
762 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
763 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
767 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
773 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
780 fms.s1 FR_XLog_Lo = FR_GG, f1, FR_XLog_Hi // bl = gh - bh
781 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
784 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
795 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
800 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
801 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
808 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
813 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
814 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_AX // bl = bl + x
815 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
818 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
820 sub GR_N = GR_N, GR_Bias // sub bias from exp
824 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
825 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
826 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
830 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
836 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
837 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
838 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
841 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
842 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
843 // So we can negate Q coefficients there for negative values
847 (p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
852 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
858 (p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
864 (p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
870 (p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
871 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
875 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
881 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
887 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
888 fcvt.xf FR_float_N = FR_float_N
894 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
899 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
905 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
910 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
916 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
921 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
927 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
933 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
938 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
944 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
949 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
955 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
960 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
966 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
971 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
975 .pred.rel "mutex",p12,p11
978 (p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
983 (p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
988 .pred.rel "mutex",p12,p11
991 (p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
996 (p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1003 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
1004 // Y_lo=poly_hi+poly_lo
1009 (p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1015 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1016 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1019 // * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1023 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
1024 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
1025 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
1029 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
1031 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
1034 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
1036 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
1042 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
1045 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
1051 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
1053 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
1057 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
1059 mov GR_exp_mask = 0x1FFFF // Create exponent mask
1062 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1064 mov GR_Bias = 0x0FFFF // Create exponent bias
1068 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
1069 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
1074 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
1075 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
1080 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
1082 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
1085 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1089 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
1090 sub GR_N = GR_N, GR_Bias
1091 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
1095 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
1097 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
1101 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
1102 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
1108 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
1109 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
1113 ldfe FR_Q1 = [GR_ad_q] // Load Q1
1114 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
1119 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
1120 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
1125 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
1131 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
1136 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
1144 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1147 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1149 // JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1171 (p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
1172 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
1176 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
1177 fcvt.xf FR_float_N = FR_float_N
1182 (p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
1187 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
1188 (p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
1193 (p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
1198 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
1199 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1204 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1211 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1216 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
1221 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
1227 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1233 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1238 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1244 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
1250 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1255 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1261 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1266 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1270 .pred.rel "mutex",p12,p11
1273 (p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1278 (p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1283 .pred.rel "mutex",p12,p11
1286 (p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1291 (p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1297 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
1302 (p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1308 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1309 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1312 // NEAR ZERO POLYNOMIAL INTERVAL
1316 fma.s1 FR_X4 = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1322 fma.s1 FR_P9 = FR_C9,FR_X2,FR_C7 // p9 = C9*x^2 + C7
1327 fma.s1 FR_P5 = FR_C5,FR_X2,FR_C3 // p5 = C5*x^2 + C3
1333 fma.s1 FR_P3 = FR_P9,FR_X4,FR_P5 // p3 = p9*x^4 + p5
1339 fma.s0 FR_Res = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1340 br.ret.sptk b0 // Near 0 path return
1343 GLOBAL_LIBM_END(asinhl)
1344 libm_alias_ldouble_other (asinh, asinh)