4 // Copyright (c) 2000 - 2003, Intel Corporation
5 // All rights reserved.
7 // Contributed 2000 by the Intel Numerics Group, Intel Corporation
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
40 //*********************************************************************
43 // 09/04/01 Initial version
44 // 09/13/01 Performance improved, symmetry problems fixed
45 // 10/10/01 Performance improved, split issues removed
46 // 12/11/01 Changed huges_logp to not be global
47 // 05/20/02 Cleaned up namespace and sf0 syntax
48 // 02/10/03 Reordered header: .section, .global, .proc, .align;
49 // used data8 for long double table values
51 //*********************************************************************
54 //==============================================================
55 // long double asinhl(long double);
57 // Overview of operation
58 //==============================================================
61 // 1. x = 0, [S,Q]Nan or +/-INF
62 // Return asinhl(x) = x + x;
65 // Return asinhl(x) = x - x^2;
68 // Return asinhl(x) = x + x^2;
70 // 4. 'Near 0': max denormal < |x| < 1/128
71 // Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
73 // 5. 'Huges': |x| > 2^63
74 // Return asinhl(x) = sign(x)*(logl(2*x));
76 // 6. 'Main path': 1/128 < |x| < 2^63
77 // b_hi + b_lo = x + sqrt(x^2 + 1);
78 // asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
80 // Algorithm description
81 //==============================================================
83 // Main path algorithm
84 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
85 // *************************************************************************
87 // There are 3 parts of x+sqrt(x^2+1) computation:
89 // 1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
90 // ------------------------------------
91 // p2_hi = x2_hi + 1, where x2_hi = x * x;
92 // p2_lo = x2_lo + p1_lo, where
93 // x2_lo = FMS(x*x-x2_hi),
94 // p1_lo = (1 - p2_hi) + x2_hi;
96 // 2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
97 // ----------------------------------------------
98 // r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
99 // g = p2_hi * r (first 8 bit-approximation of sqrt);
103 // g = g * e + g (second 16 bit-approximation of sqrt);
107 // g = g * e + g (third 32 bit-approximation of sqrt);
111 // g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
113 // Remainder computation:
115 // d = (p2_hi - g_hi * g_hi) + p2_lo;
118 // 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
119 // -------------------------------------------------------------------
120 // b_hi = (g_hi + x) + gl;
121 // b_lo = (g_hi - b_hi) + x + gl;
123 // Now we pass b presented as sum b_hi + b_lo to special version
124 // of logl function which accept a pair of arguments as
125 // 'mutiprecision' value.
127 // Special log algorithm overview
128 // ================================
129 // Here we use a table lookup method. The basic idea is that in
130 // order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
131 // we construct a value G such that G*Arg is close to 1 and that
132 // logl(1/G) is obtainable easily from a table of values calculated
135 // logl(Arg) = logl(1/G) + logl((G*Arg - 1))
137 // Because |G*Arg - 1| is small, the second term on the right hand
138 // side can be approximated by a short polynomial. We elaborate
139 // this method in four steps.
141 // Step 0: Initialization
143 // We need to calculate logl( X ). Obtain N, S_hi such that
145 // X = 2^N * ( S_hi + S_lo ) exactly
147 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
148 // that |S_lo| <= ulp(S_hi).
150 // For the special version of logl: S_lo = b_lo
151 // !-----------------------------------------------!
153 // Step 1: Argument Reduction
155 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
157 // G := G_1 * G_2 * G_3
158 // r := (G * S_hi - 1) + G * S_lo
160 // These G_j's have the property that the product is exactly
161 // representable and that |r| < 2^(-12) as a result.
163 // Step 2: Approximation
165 // logl(1 + r) is approximated by a short polynomial poly(r).
167 // Step 3: Reconstruction
171 // logl( X ) = logl( 2^N * (S_hi + S_lo) )
172 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
173 // ~=~ N*logl(2) + logl(1/G) + poly(r).
175 // For detailed description see logl or log1pl function, regular path.
178 //==============================================================
179 // Floating Point registers used:
181 // f32 -> f101 (70 registers)
183 // General registers used:
184 // r32 -> r57 (26 registers)
186 // Predicate registers used:
188 // p6 for '0, NaNs, Inf' path
189 // p7 for '+ denormals' path
190 // p8 for 'near 0' path
191 // p9 for 'huges' path
192 // p10 for '- denormals' path
193 // p11 for negative values
196 //==============================================================
201 // C7, C9 'near 0' polynomial coefficients
202 LOCAL_OBJECT_START(Poly_C_near_0_79)
203 data8 0xF8DC939BBEDD5A54, 0x00003FF9
204 data8 0xB6DB6DAB21565AC5, 0x0000BFFA
205 LOCAL_OBJECT_END(Poly_C_near_0_79)
207 // C3, C5 'near 0' polynomial coefficients
208 LOCAL_OBJECT_START(Poly_C_near_0_35)
209 data8 0x999999999991D582, 0x00003FFB
210 data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
211 LOCAL_OBJECT_END(Poly_C_near_0_35)
214 LOCAL_OBJECT_START(Constants_Q)
215 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
216 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
217 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
218 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
219 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
220 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
221 LOCAL_OBJECT_END(Constants_Q)
224 LOCAL_OBJECT_START(Constants_Z_1)
241 LOCAL_OBJECT_END(Constants_Z_1)
243 // G1 and H1 - IEEE single and h1 - IEEE double
244 LOCAL_OBJECT_START(Constants_G_H_h1)
245 data4 0x3F800000,0x00000000
246 data8 0x0000000000000000
247 data4 0x3F70F0F0,0x3D785196
248 data8 0x3DA163A6617D741C
249 data4 0x3F638E38,0x3DF13843
250 data8 0x3E2C55E6CBD3D5BB
251 data4 0x3F579430,0x3E2FF9A0
252 data8 0xBE3EB0BFD86EA5E7
253 data4 0x3F4CCCC8,0x3E647FD6
254 data8 0x3E2E6A8C86B12760
255 data4 0x3F430C30,0x3E8B3AE7
256 data8 0x3E47574C5C0739BA
257 data4 0x3F3A2E88,0x3EA30C68
258 data8 0x3E20E30F13E8AF2F
259 data4 0x3F321640,0x3EB9CEC8
260 data8 0xBE42885BF2C630BD
261 data4 0x3F2AAAA8,0x3ECF9927
262 data8 0x3E497F3497E577C6
263 data4 0x3F23D708,0x3EE47FC5
264 data8 0x3E3E6A6EA6B0A5AB
265 data4 0x3F1D89D8,0x3EF8947D
266 data8 0xBDF43E3CD328D9BE
267 data4 0x3F17B420,0x3F05F3A1
268 data8 0x3E4094C30ADB090A
269 data4 0x3F124920,0x3F0F4303
270 data8 0xBE28FBB2FC1FE510
271 data4 0x3F0D3DC8,0x3F183EBF
272 data8 0x3E3A789510FDE3FA
273 data4 0x3F088888,0x3F20EC80
274 data8 0x3E508CE57CC8C98F
275 data4 0x3F042108,0x3F29516A
276 data8 0xBE534874A223106C
277 LOCAL_OBJECT_END(Constants_G_H_h1)
280 LOCAL_OBJECT_START(Constants_Z_2)
297 LOCAL_OBJECT_END(Constants_Z_2)
299 // G2 and H2 - IEEE single and h2 - IEEE double
300 LOCAL_OBJECT_START(Constants_G_H_h2)
301 data4 0x3F800000,0x00000000
302 data8 0x0000000000000000
303 data4 0x3F7F00F8,0x3B7F875D
304 data8 0x3DB5A11622C42273
305 data4 0x3F7E03F8,0x3BFF015B
306 data8 0x3DE620CF21F86ED3
307 data4 0x3F7D08E0,0x3C3EE393
308 data8 0xBDAFA07E484F34ED
309 data4 0x3F7C0FC0,0x3C7E0586
310 data8 0xBDFE07F03860BCF6
311 data4 0x3F7B1880,0x3C9E75D2
312 data8 0x3DEA370FA78093D6
313 data4 0x3F7A2328,0x3CBDC97A
314 data8 0x3DFF579172A753D0
315 data4 0x3F792FB0,0x3CDCFE47
316 data8 0x3DFEBE6CA7EF896B
317 data4 0x3F783E08,0x3CFC15D0
318 data8 0x3E0CF156409ECB43
319 data4 0x3F774E38,0x3D0D874D
320 data8 0xBE0B6F97FFEF71DF
321 data4 0x3F766038,0x3D1CF49B
322 data8 0xBE0804835D59EEE8
323 data4 0x3F757400,0x3D2C531D
324 data8 0x3E1F91E9A9192A74
325 data4 0x3F748988,0x3D3BA322
326 data8 0xBE139A06BF72A8CD
327 data4 0x3F73A0D0,0x3D4AE46F
328 data8 0x3E1D9202F8FBA6CF
329 data4 0x3F72B9D0,0x3D5A1756
330 data8 0xBE1DCCC4BA796223
331 data4 0x3F71D488,0x3D693B9D
332 data8 0xBE049391B6B7C239
333 LOCAL_OBJECT_END(Constants_G_H_h2)
335 // G3 and H3 - IEEE single and h3 - IEEE double
336 LOCAL_OBJECT_START(Constants_G_H_h3)
337 data4 0x3F7FFC00,0x38800100
338 data8 0x3D355595562224CD
339 data4 0x3F7FF400,0x39400480
340 data8 0x3D8200A206136FF6
341 data4 0x3F7FEC00,0x39A00640
342 data8 0x3DA4D68DE8DE9AF0
343 data4 0x3F7FE400,0x39E00C41
344 data8 0xBD8B4291B10238DC
345 data4 0x3F7FDC00,0x3A100A21
346 data8 0xBD89CCB83B1952CA
347 data4 0x3F7FD400,0x3A300F22
348 data8 0xBDB107071DC46826
349 data4 0x3F7FCC08,0x3A4FF51C
350 data8 0x3DB6FCB9F43307DB
351 data4 0x3F7FC408,0x3A6FFC1D
352 data8 0xBD9B7C4762DC7872
353 data4 0x3F7FBC10,0x3A87F20B
354 data8 0xBDC3725E3F89154A
355 data4 0x3F7FB410,0x3A97F68B
356 data8 0xBD93519D62B9D392
357 data4 0x3F7FAC18,0x3AA7EB86
358 data8 0x3DC184410F21BD9D
359 data4 0x3F7FA420,0x3AB7E101
360 data8 0xBDA64B952245E0A6
361 data4 0x3F7F9C20,0x3AC7E701
362 data8 0x3DB4B0ECAABB34B8
363 data4 0x3F7F9428,0x3AD7DD7B
364 data8 0x3D9923376DC40A7E
365 data4 0x3F7F8C30,0x3AE7D474
366 data8 0x3DC6E17B4F2083D3
367 data4 0x3F7F8438,0x3AF7CBED
368 data8 0x3DAE314B811D4394
369 data4 0x3F7F7C40,0x3B03E1F3
370 data8 0xBDD46F21B08F2DB1
371 data4 0x3F7F7448,0x3B0BDE2F
372 data8 0xBDDC30A46D34522B
373 data4 0x3F7F6C50,0x3B13DAAA
374 data8 0x3DCB0070B1F473DB
375 data4 0x3F7F6458,0x3B1BD766
376 data8 0xBDD65DDC6AD282FD
377 data4 0x3F7F5C68,0x3B23CC5C
378 data8 0xBDCDAB83F153761A
379 data4 0x3F7F5470,0x3B2BC997
380 data8 0xBDDADA40341D0F8F
381 data4 0x3F7F4C78,0x3B33C711
382 data8 0x3DCD1BD7EBC394E8
383 data4 0x3F7F4488,0x3B3BBCC6
384 data8 0xBDC3532B52E3E695
385 data4 0x3F7F3C90,0x3B43BAC0
386 data8 0xBDA3961EE846B3DE
387 data4 0x3F7F34A0,0x3B4BB0F4
388 data8 0xBDDADF06785778D4
389 data4 0x3F7F2CA8,0x3B53AF6D
390 data8 0x3DCC3ED1E55CE212
391 data4 0x3F7F24B8,0x3B5BA620
392 data8 0xBDBA31039E382C15
393 data4 0x3F7F1CC8,0x3B639D12
394 data8 0x3D635A0B5C5AF197
395 data4 0x3F7F14D8,0x3B6B9444
396 data8 0xBDDCCB1971D34EFC
397 data4 0x3F7F0CE0,0x3B7393BC
398 data8 0x3DC7450252CD7ADA
399 data4 0x3F7F04F0,0x3B7B8B6D
400 data8 0xBDB68F177D7F2A42
401 LOCAL_OBJECT_END(Constants_G_H_h3)
404 //==============================================================
406 // Floating Point Registers
414 // Special logl registers
466 FR_2_to_minus_N = f76
469 // Huge & Main path prolog registers
487 // Near 0 & Huges path prolog registers
500 // General Purpose Registers
502 // General prolog registers
510 // Near 0 path prolog registers
514 // Special logl registers
543 GLOBAL_LIBM_ENTRY(asinhl)
546 alloc GR_PFS = ar.pfs,0,27,0,0
547 fma.s1 FR_P2 = FR_Arg, FR_Arg, f1 // p2 = x^2 + 1
548 mov GR_Half = 0xfffe // 0.5's exp
551 addl GR_Poly_C_79 = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
552 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
553 addl GR_Poly_C_35 = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
557 getf.exp GR_ArgExp = FR_Arg // get arument's exponent
558 fabs FR_AX = FR_Arg // absolute value of argument
559 mov GR_TwoN7 = 0xfff8 // 2^-7 exp
562 ld8 GR_Poly_C_79 = [GR_Poly_C_79] // get actual coeff table address
563 fma.s0 FR_Two = f1, f1, f1 // construct 2.0
564 mov GR_ExpMask = 0x1ffff // mask for exp
568 ld8 GR_Poly_C_35 = [GR_Poly_C_35] // get actual coeff table address
569 fclass.m p6,p0 = FR_Arg, 0xe7 // if arg NaN inf zero
570 mov GR_TwoP63 = 0x1003e // 2^63 exp
573 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
579 setf.exp FR_Half = GR_Half // construct 0.5
580 fclass.m p7,p0 = FR_Arg, 0x09 // if arg + denorm
581 and GR_ArgExp = GR_ExpMask, GR_ArgExp // select exp
584 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
589 ldfe FR_C9 = [GR_Poly_C_79],16 // load C9
590 fclass.m p10,p0 = FR_Arg, 0x0a // if arg - denorm
591 cmp.gt p8, p0 = GR_TwoN7, GR_ArgExp // if arg < 2^-7 ('near 0')
594 cmp.le p9, p0 = GR_TwoP63, GR_ArgExp // if arg > 2^63 ('huges')
595 (p6) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a
596 (p6) br.ret.spnt b0 // return
598 // (X^2 + 1) computation
600 (p8) ldfe FR_C5 = [GR_Poly_C_35],16 // load C5
601 fms.s1 FR_Tmp = f1, f1, FR_P2 // Tmp = 1 - p2
602 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
605 (p8) ldfe FR_C7 = [GR_Poly_C_79],16 // load C7
606 (p7) fnma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a - a*a
607 (p7) br.ret.spnt b0 // return
611 (p8) ldfe FR_C3 = [GR_Poly_C_35],16 // load C3
612 fcmp.lt.s1 p11, p12 = FR_Arg, f0 // if arg is negative
613 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
616 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
617 (p10) fma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a + a*a
618 (p10) br.ret.spnt b0 // return
622 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
623 frsqrta.s1 FR_Rcp, p0 = FR_P2 // Rcp = 1/p2 reciprocal appr.
624 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
628 fms.s1 FR_P2L = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
629 mov GR_Bias = 0x0FFFF // Create exponent bias
634 (p9) fms.s1 FR_XLog_Hi = FR_Two, FR_AX, f0 // Hi of log1p arg = 2*X - 1
635 (p9) br.cond.spnt huges_logl // special version of log1p
639 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
640 (p8) fma.s1 FR_X3 = FR_X2, FR_Arg, f0 // x^3 = x^2 * x
641 (p8) br.cond.spnt near_0 // Go to near 0 branch
645 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
651 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
652 fma.s1 FR_Tmp = FR_Tmp, f1, FR_X2 // Tmp = Tmp + x^2
653 mov GR_exp_mask = 0x1FFFF // Create exponent mask
657 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
658 fma.s1 FR_GG = FR_Rcp, FR_P2, f0 // g = Rcp * p2
659 // 8 bit Newton Raphson iteration
664 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
668 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
669 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
674 fma.s1 FR_P2L = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
679 ldfe FR_Q1 = [GR_ad_q] // Load Q1
680 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
681 // 16 bit Newton Raphson iteration
686 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
692 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
698 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
699 // 32 bit Newton Raphson iteration
704 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
710 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
716 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
717 // 64 bit Newton Raphson iteration
722 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
728 fnma.s1 FR_DD = FR_GG, FR_GG, FR_P2 // Remainder d = g * g - p2
733 fma.s1 FR_XLog_Hi = FR_AX, f1, FR_GG // bh = z + gh
739 fma.s1 FR_DD = FR_DD, f1, FR_P2L // add p2l: d = d + p2l
745 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
746 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
747 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
752 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
753 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
757 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
762 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
763 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
764 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
768 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
774 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
781 fms.s1 FR_XLog_Lo = FR_GG, f1, FR_XLog_Hi // bl = gh - bh
782 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
785 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
796 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
801 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
802 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
809 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
814 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
815 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_AX // bl = bl + x
816 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
819 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
821 sub GR_N = GR_N, GR_Bias // sub bias from exp
825 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
826 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
827 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
831 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
837 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
838 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
839 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
842 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
843 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
844 // So we can negate Q coefficients there for negative values
848 (p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
853 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
859 (p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
865 (p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
871 (p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
872 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
876 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
882 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
888 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
889 fcvt.xf FR_float_N = FR_float_N
895 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
900 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
906 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
911 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
917 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
922 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
928 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
934 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
939 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
945 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
950 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
956 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
961 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
967 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
972 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
976 .pred.rel "mutex",p12,p11
979 (p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
984 (p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
989 .pred.rel "mutex",p12,p11
992 (p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
997 (p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1004 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
1005 // Y_lo=poly_hi+poly_lo
1010 (p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1016 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1017 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1020 // * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1024 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
1025 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
1026 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
1030 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
1032 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
1035 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
1037 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
1043 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
1046 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
1052 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
1054 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
1058 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
1060 mov GR_exp_mask = 0x1FFFF // Create exponent mask
1063 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1065 mov GR_Bias = 0x0FFFF // Create exponent bias
1069 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
1070 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
1075 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
1076 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
1081 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
1083 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
1086 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1090 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
1091 sub GR_N = GR_N, GR_Bias
1092 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
1096 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
1098 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
1102 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
1103 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
1109 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
1110 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
1114 ldfe FR_Q1 = [GR_ad_q] // Load Q1
1115 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
1120 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
1121 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
1126 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
1132 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
1137 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
1145 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1148 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1150 // JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1172 (p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
1173 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
1177 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
1178 fcvt.xf FR_float_N = FR_float_N
1183 (p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
1188 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
1189 (p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
1194 (p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
1199 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
1200 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1205 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1212 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1217 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
1222 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
1228 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1234 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1239 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1245 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
1251 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1256 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1262 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1267 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1271 .pred.rel "mutex",p12,p11
1274 (p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1279 (p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1284 .pred.rel "mutex",p12,p11
1287 (p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1292 (p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1298 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
1303 (p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1309 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1310 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1313 // NEAR ZERO POLYNOMIAL INTERVAL
1317 fma.s1 FR_X4 = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1323 fma.s1 FR_P9 = FR_C9,FR_X2,FR_C7 // p9 = C9*x^2 + C7
1328 fma.s1 FR_P5 = FR_C5,FR_X2,FR_C3 // p5 = C5*x^2 + C3
1334 fma.s1 FR_P3 = FR_P9,FR_X4,FR_P5 // p3 = p9*x^4 + p5
1340 fma.s0 FR_Res = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1341 br.ret.sptk b0 // Near 0 path return
1344 GLOBAL_LIBM_END(asinhl)