4 // Copyright (c) 2000 - 2005, Intel Corporation
5 // All rights reserved.
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 //*********************************************************************
42 // 10/01/01 Initial version
43 // 10/10/01 Performance inproved
44 // 12/11/01 Changed huges_logp to not be global
45 // 01/02/02 Corrected .restore syntax
46 // 05/20/02 Cleaned up namespace and sf0 syntax
47 // 08/14/02 Changed mli templates to mlx
48 // 02/06/03 Reorganized data tables
49 // 03/31/05 Reformatted delimiters between data tables
51 //*********************************************************************
54 //==============================================================
55 // long double acoshl(long double);
57 // Overview of operation
58 //==============================================================
62 // Return acoshl(x) = 0;
65 // Return acoshl(x) = Nan (Domain error, error handler call with tag 135);
67 // 3. x = [S,Q]Nan or +INF
68 // Return acoshl(x) = x + x;
70 // 4. 'Near 1': 1 < x < 1+1/8
71 // Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
72 // where y = 1, P(y)/Q(y) - rational approximation
74 // 5. 'Huges': x > 0.5*2^64
75 // Return acoshl(x) = (logl(2*x-1));
77 // 6. 'Main path': 1+1/8 < x < 0.5*2^64
78 // b_hi + b_lo = x + sqrt(x^2 - 1);
79 // acoshl(x) = logl_special(b_hi, b_lo);
81 // Algorithm description
82 //==============================================================
84 // I. Near 1 path algorithm
85 // **************************************************************
86 // The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
87 // where y = 1, P(y)/Q(y) - rational approximation
89 // 1) y = x - 1, y2 = 2 * y
91 // 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y)
92 // a) sqrtl computation method described below (main path algorithm, item 2))
93 // As result we obtain (gg+gl) - multiprecision result
94 // as pair of double extended values
95 // b) P(y) and Q(y) calculated without any extra precision manipulations
97 // y = frcpa(Q) initial approximation of 1/Q
98 // z = P*y initial approximation of P/Q
103 // y1 = y + y*e2 = y + y*(e+e^2)
106 // y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
110 // xx = z + r*y2 high part of a/b
114 // xl = r1*y3 low part of a/b
116 // 3) res = sqrt(2*y) - sqrt(2*y)*(P(y)/Q(y)) =
117 // = (gg+gl) - (gg + gl)*(xx+xl);
119 // a) hh = gg*xx; hl = gg*xl; lh = gl*xx; ll = gl*xl;
120 // b) res = ((((gl + ll) + lh) + hl) + hh) + gg;
121 // (exactly in this order)
123 // II. Main path algorithm
124 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
125 // **********************************************************************
127 // There are 3 parts of x+sqrt(x^2-1) computation:
129 // 1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining
130 // ------------------------------------
131 // m2_hi = x2_hi - 1, where x2_hi = x * x;
132 // m2_lo = x2_lo + p1_lo, where
133 // x2_lo = FMS(x*x-x2_hi),
134 // p1_lo = (1 + m2_hi) - x2_hi;
136 // 2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo)
137 // ----------------------------------------------
138 // r = invsqrt(m2_hi) (8-bit reciprocal square root approximation);
139 // g = m2_hi * r (first 8 bit-approximation of sqrt);
143 // g = g * e + g (second 16 bit-approximation of sqrt);
147 // g = g * e + g (third 32 bit-approximation of sqrt);
151 // g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
153 // Remainder computation:
155 // d = (m2_hi - g_hi * g_hi) + m2_lo;
158 // 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2-1)
159 // -------------------------------------------------------------------
160 // b_hi = (g_hi + x) + gl;
161 // b_lo = (x - b_hi) + g_hi + gl;
163 // Now we pass b presented as sum b_hi + b_lo to special version
164 // of logl function which accept a pair of arguments as
165 // mutiprecision value.
167 // Special log algorithm overview
168 // ================================
169 // Here we use a table lookup method. The basic idea is that in
170 // order to compute logl(Arg) for an argument Arg in [1,2),
171 // we construct a value G such that G*Arg is close to 1 and that
172 // logl(1/G) is obtainable easily from a table of values calculated
175 // logl(Arg) = logl(1/G) + logl((G*Arg - 1))
177 // Because |G*Arg - 1| is small, the second term on the right hand
178 // side can be approximated by a short polynomial. We elaborate
179 // this method in four steps.
181 // Step 0: Initialization
183 // We need to calculate logl( X+1 ). Obtain N, S_hi such that
185 // X = 2^N * ( S_hi + S_lo ) exactly
187 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
188 // that |S_lo| <= ulp(S_hi).
190 // For the special version of logl: S_lo = b_lo
191 // !-----------------------------------------------!
193 // Step 1: Argument Reduction
195 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
197 // G := G_1 * G_2 * G_3
198 // r := (G * S_hi - 1) + G * S_lo
200 // These G_j's have the property that the product is exactly
201 // representable and that |r| < 2^(-12) as a result.
203 // Step 2: Approximation
205 // logl(1 + r) is approximated by a short polynomial poly(r).
207 // Step 3: Reconstruction
209 // Finally, logl( X ) = logl( X+1 ) is given by
211 // logl( X ) = logl( 2^N * (S_hi + S_lo) )
212 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
213 // ~=~ N*logl(2) + logl(1/G) + poly(r).
215 // For detailed description see logl or log1pl function, regular path.
218 //==============================================================
219 // Floating Point registers used:
221 // f32 -> f95 (64 registers)
223 // General registers used:
224 // r32 -> r67 (36 registers)
226 // Predicate registers used:
228 // p7 for 'NaNs, Inf' path
229 // p8 for 'near 1' path
230 // p9 for 'huges' path
234 //*********************************************************************
235 // IEEE Special Conditions:
237 // acoshl(+inf) = +inf
238 // acoshl(-inf) = QNaN
240 // acoshl(x<1) = QNaN
241 // acoshl(SNaN) = QNaN
242 // acoshl(QNaN) = QNaN
246 //==============================================================
251 // Near 1 path rational approximation coefficients
252 LOCAL_OBJECT_START(Poly_P)
253 data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4
254 data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2
255 data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1
256 data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1
257 data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1
258 data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26
259 LOCAL_OBJECT_END(Poly_P)
262 LOCAL_OBJECT_START(Poly_Q)
263 data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3
264 data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2
265 data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1
266 data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302
267 data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526
268 data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748
269 LOCAL_OBJECT_END(Poly_Q)
272 LOCAL_OBJECT_START(Constants_Q)
273 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
274 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
275 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
276 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
277 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
278 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
279 LOCAL_OBJECT_END(Constants_Q)
282 LOCAL_OBJECT_START(Constants_Z_1)
299 LOCAL_OBJECT_END(Constants_Z_1)
301 // G1 and H1 - IEEE single and h1 - IEEE double
302 LOCAL_OBJECT_START(Constants_G_H_h1)
303 data4 0x3F800000,0x00000000
304 data8 0x0000000000000000
305 data4 0x3F70F0F0,0x3D785196
306 data8 0x3DA163A6617D741C
307 data4 0x3F638E38,0x3DF13843
308 data8 0x3E2C55E6CBD3D5BB
309 data4 0x3F579430,0x3E2FF9A0
310 data8 0xBE3EB0BFD86EA5E7
311 data4 0x3F4CCCC8,0x3E647FD6
312 data8 0x3E2E6A8C86B12760
313 data4 0x3F430C30,0x3E8B3AE7
314 data8 0x3E47574C5C0739BA
315 data4 0x3F3A2E88,0x3EA30C68
316 data8 0x3E20E30F13E8AF2F
317 data4 0x3F321640,0x3EB9CEC8
318 data8 0xBE42885BF2C630BD
319 data4 0x3F2AAAA8,0x3ECF9927
320 data8 0x3E497F3497E577C6
321 data4 0x3F23D708,0x3EE47FC5
322 data8 0x3E3E6A6EA6B0A5AB
323 data4 0x3F1D89D8,0x3EF8947D
324 data8 0xBDF43E3CD328D9BE
325 data4 0x3F17B420,0x3F05F3A1
326 data8 0x3E4094C30ADB090A
327 data4 0x3F124920,0x3F0F4303
328 data8 0xBE28FBB2FC1FE510
329 data4 0x3F0D3DC8,0x3F183EBF
330 data8 0x3E3A789510FDE3FA
331 data4 0x3F088888,0x3F20EC80
332 data8 0x3E508CE57CC8C98F
333 data4 0x3F042108,0x3F29516A
334 data8 0xBE534874A223106C
335 LOCAL_OBJECT_END(Constants_G_H_h1)
338 LOCAL_OBJECT_START(Constants_Z_2)
355 LOCAL_OBJECT_END(Constants_Z_2)
357 // G2 and H2 - IEEE single and h2 - IEEE double
358 LOCAL_OBJECT_START(Constants_G_H_h2)
359 data4 0x3F800000,0x00000000
360 data8 0x0000000000000000
361 data4 0x3F7F00F8,0x3B7F875D
362 data8 0x3DB5A11622C42273
363 data4 0x3F7E03F8,0x3BFF015B
364 data8 0x3DE620CF21F86ED3
365 data4 0x3F7D08E0,0x3C3EE393
366 data8 0xBDAFA07E484F34ED
367 data4 0x3F7C0FC0,0x3C7E0586
368 data8 0xBDFE07F03860BCF6
369 data4 0x3F7B1880,0x3C9E75D2
370 data8 0x3DEA370FA78093D6
371 data4 0x3F7A2328,0x3CBDC97A
372 data8 0x3DFF579172A753D0
373 data4 0x3F792FB0,0x3CDCFE47
374 data8 0x3DFEBE6CA7EF896B
375 data4 0x3F783E08,0x3CFC15D0
376 data8 0x3E0CF156409ECB43
377 data4 0x3F774E38,0x3D0D874D
378 data8 0xBE0B6F97FFEF71DF
379 data4 0x3F766038,0x3D1CF49B
380 data8 0xBE0804835D59EEE8
381 data4 0x3F757400,0x3D2C531D
382 data8 0x3E1F91E9A9192A74
383 data4 0x3F748988,0x3D3BA322
384 data8 0xBE139A06BF72A8CD
385 data4 0x3F73A0D0,0x3D4AE46F
386 data8 0x3E1D9202F8FBA6CF
387 data4 0x3F72B9D0,0x3D5A1756
388 data8 0xBE1DCCC4BA796223
389 data4 0x3F71D488,0x3D693B9D
390 data8 0xBE049391B6B7C239
391 LOCAL_OBJECT_END(Constants_G_H_h2)
393 // G3 and H3 - IEEE single and h3 - IEEE double
394 LOCAL_OBJECT_START(Constants_G_H_h3)
395 data4 0x3F7FFC00,0x38800100
396 data8 0x3D355595562224CD
397 data4 0x3F7FF400,0x39400480
398 data8 0x3D8200A206136FF6
399 data4 0x3F7FEC00,0x39A00640
400 data8 0x3DA4D68DE8DE9AF0
401 data4 0x3F7FE400,0x39E00C41
402 data8 0xBD8B4291B10238DC
403 data4 0x3F7FDC00,0x3A100A21
404 data8 0xBD89CCB83B1952CA
405 data4 0x3F7FD400,0x3A300F22
406 data8 0xBDB107071DC46826
407 data4 0x3F7FCC08,0x3A4FF51C
408 data8 0x3DB6FCB9F43307DB
409 data4 0x3F7FC408,0x3A6FFC1D
410 data8 0xBD9B7C4762DC7872
411 data4 0x3F7FBC10,0x3A87F20B
412 data8 0xBDC3725E3F89154A
413 data4 0x3F7FB410,0x3A97F68B
414 data8 0xBD93519D62B9D392
415 data4 0x3F7FAC18,0x3AA7EB86
416 data8 0x3DC184410F21BD9D
417 data4 0x3F7FA420,0x3AB7E101
418 data8 0xBDA64B952245E0A6
419 data4 0x3F7F9C20,0x3AC7E701
420 data8 0x3DB4B0ECAABB34B8
421 data4 0x3F7F9428,0x3AD7DD7B
422 data8 0x3D9923376DC40A7E
423 data4 0x3F7F8C30,0x3AE7D474
424 data8 0x3DC6E17B4F2083D3
425 data4 0x3F7F8438,0x3AF7CBED
426 data8 0x3DAE314B811D4394
427 data4 0x3F7F7C40,0x3B03E1F3
428 data8 0xBDD46F21B08F2DB1
429 data4 0x3F7F7448,0x3B0BDE2F
430 data8 0xBDDC30A46D34522B
431 data4 0x3F7F6C50,0x3B13DAAA
432 data8 0x3DCB0070B1F473DB
433 data4 0x3F7F6458,0x3B1BD766
434 data8 0xBDD65DDC6AD282FD
435 data4 0x3F7F5C68,0x3B23CC5C
436 data8 0xBDCDAB83F153761A
437 data4 0x3F7F5470,0x3B2BC997
438 data8 0xBDDADA40341D0F8F
439 data4 0x3F7F4C78,0x3B33C711
440 data8 0x3DCD1BD7EBC394E8
441 data4 0x3F7F4488,0x3B3BBCC6
442 data8 0xBDC3532B52E3E695
443 data4 0x3F7F3C90,0x3B43BAC0
444 data8 0xBDA3961EE846B3DE
445 data4 0x3F7F34A0,0x3B4BB0F4
446 data8 0xBDDADF06785778D4
447 data4 0x3F7F2CA8,0x3B53AF6D
448 data8 0x3DCC3ED1E55CE212
449 data4 0x3F7F24B8,0x3B5BA620
450 data8 0xBDBA31039E382C15
451 data4 0x3F7F1CC8,0x3B639D12
452 data8 0x3D635A0B5C5AF197
453 data4 0x3F7F14D8,0x3B6B9444
454 data8 0xBDDCCB1971D34EFC
455 data4 0x3F7F0CE0,0x3B7393BC
456 data8 0x3DC7450252CD7ADA
457 data4 0x3F7F04F0,0x3B7B8B6D
458 data8 0xBDB68F177D7F2A42
459 LOCAL_OBJECT_END(Constants_G_H_h3)
462 //==============================================================
464 // Floating Point Registers
513 // Special logl registers
544 FR_2_to_minus_N = f86
584 // Error handler registers
589 // General Purpose Registers
591 // General prolog registers
598 // Near 1 path registers
602 // Special logl registers
629 // Added for unwind support
637 GR_Parameter_RESULT = r66
638 GR_Parameter_TAG = r67
643 GLOBAL_LIBM_ENTRY(acoshl)
646 alloc GR_PFS = ar.pfs,0,32,4,0 // Local frame allocation
647 fcmp.lt.s1 p11, p0 = FR_Arg, f1 // if arg is less than 1
648 mov GR_Half = 0xfffe // 0.5's exp
651 addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table
652 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
653 addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table
657 getf.d GR_Arg = FR_Arg // get argument as double (int64)
658 fma.s0 FR_Two = f1, f1, f1 // construct 2.0
659 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables
663 movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments)
667 ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address
668 fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0)
672 ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address
673 movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound)
677 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
678 fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf
679 cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')
682 cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path
683 fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path)
684 (p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1)
688 setf.exp FR_Half = GR_Half // construct 0.5
689 (p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path)
690 mov GR_exp_mask = 0x1FFFF // Create exponent mask
694 (p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5
695 (p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5
696 fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1
700 (p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4
701 fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of
702 // m2 = fma(X*X - m2)
703 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
706 (p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4
707 (p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf)
708 (p7) br.ret.spnt b0 // return (Nan, Inf)
712 (p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3
714 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
717 (p8) ldfe FR_QQ3 = [GR_Poly_Q],16 // Load Q3
718 (p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1
719 (p9) br.cond.spnt huges_logl // special version of log
724 (p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2
725 (p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0
726 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
729 (p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2
730 (p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1)
731 (p10) br.ret.spnt b0 // return (arg = 1)
735 (p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1
736 (p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1
737 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
742 (p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0
743 fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2
744 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
747 (p8) ldfe FR_QQ0 = [GR_Poly_Q]
749 (p8) br.cond.spnt near_1 // near 1 path
752 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
754 mov GR_Bias = 0x0FFFF // Create exponent bias
758 frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr.
763 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
764 fms.s1 FR_Tmp = FR_X2, f1, FR_Tmp // Tmp = x^2 - Tmp
769 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
770 fma.s1 FR_GG = FR_Rcp, FR_M2, f0 // g = Rcp * m2
771 // 8 bit Newton Raphson iteration
776 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
780 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
781 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
786 fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l
791 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
792 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
793 // 16 bit Newton Raphson iteration
798 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
803 ldfe FR_Q1 = [GR_ad_q] // Load Q1
804 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
809 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
810 // 32 bit Newton Raphson iteration
815 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
821 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
827 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
828 // 64 bit Newton Raphson iteration
833 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
839 fnma.s1 FR_DD = FR_GG, FR_GG, FR_M2 // Remainder d = g * g - p2
844 fma.s1 FR_XLog_Hi = FR_Arg, f1, FR_GG // bh = z + gh
850 fma.s1 FR_DD = FR_DD, f1, FR_M2L // add p2l: d = d + p2l
855 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
857 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
862 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
863 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
867 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
874 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
875 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
876 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
880 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
886 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
893 fms.s1 FR_XLog_Lo = FR_Arg, f1, FR_XLog_Hi // bl = x - bh
894 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
897 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
908 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
914 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
915 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
922 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
926 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
927 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GG // bl = bl + gg
928 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
931 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
933 sub GR_N = GR_N, GR_Bias // sub bias from exp
937 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
938 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
939 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
943 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
949 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
950 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
951 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
954 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
955 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
956 // (Just nops added - nothing to do here)
960 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
977 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
981 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
987 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
993 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
994 fcvt.xf FR_float_N = FR_float_N
1000 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1005 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1011 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1016 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N)
1022 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
1027 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
1033 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1039 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1044 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1050 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
1055 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
1061 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1066 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1072 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1077 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1083 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1089 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1095 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
1096 // Y_lo=poly_hi+poly_lo
1102 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1103 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1109 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
1110 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
1115 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
1117 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
1120 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
1122 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
1126 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
1128 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
1132 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
1134 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
1138 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
1140 mov GR_exp_mask = 0x1FFFF // Create exponent mask
1143 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1145 mov GR_Bias = 0x0FFFF // Create exponent bias
1149 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
1150 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x|
1155 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
1156 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
1161 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
1163 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
1167 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
1168 sub GR_N = GR_N, GR_Bias
1169 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
1173 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
1175 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
1179 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
1180 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
1185 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
1187 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
1191 ldfe FR_Q1 = [GR_ad_q] // Load Q1
1192 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
1197 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
1198 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
1203 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
1209 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
1210 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
1217 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1*Z_2
1220 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
1221 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1222 // (Just nops added - nothing to do here)
1245 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
1249 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
1250 fcvt.xf FR_float_N = FR_float_N
1255 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
1261 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
1262 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1267 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1274 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1279 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2)*G_3
1284 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2)+H_3
1290 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1296 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1301 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1307 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N*log2_lo+h
1313 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1318 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1324 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1329 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1335 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1341 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1346 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
1351 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1352 br.ret.sptk b0 // Common exit
1356 // NEAR ONE INTERVAL
1360 frsqrta.s1 FR_Rcp, p0 = FR_2XM1 // Rcp = 1/x reciprocal appr. &SQRT&
1366 fma.s1 FR_PV6 = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$
1371 fma.s1 FR_QV6 = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$
1377 fma.s1 FR_PV4 = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$
1382 fma.s1 FR_QV4 = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$
1388 fma.s1 FR_XM12 = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$
1394 fma.s1 FR_PV2 = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$
1399 fma.s1 FR_QV2 = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$
1405 fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT&
1410 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT&
1417 fma.s1 FR_PV3 = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$
1422 fma.s1 FR_QV3 = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$
1429 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
1435 fma.s1 FR_PP = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$
1440 fma.s1 FR_QQ = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$
1446 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
1451 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1457 frcpa.s1 FR_Y0,p0 = f1,FR_QQ // y = frcpa(b) #DIV#
1462 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT&
1468 fma.s1 FR_Q0 = FR_PP,FR_Y0,f0 // q = a*y #DIV#
1473 fnma.s1 FR_E0 = FR_Y0,FR_QQ,f1 // e = 1 - b*y #DIV#
1479 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
1484 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1490 fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV#
1495 fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 #DIV#
1501 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
1506 fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
1512 fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV#
1517 fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV#
1523 fma.s1 FR_GG = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT&
1528 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1534 fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV#
1539 fnma.s1 FR_R0 = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV#
1545 fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
1551 fnma.s1 FR_E4 = FR_QQ,FR_Y2,f1 // e4 = 1-b*y2 #DIV#
1556 fma.s1 FR_X_Hi = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 #DIV#
1562 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h &SQRT&
1568 fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV#
1573 fnma.s1 FR_R1 = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x #DIV#
1579 fma.s1 FR_HH = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi
1584 fma.s1 FR_LH = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi
1590 fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV#
1596 fma.s1 FR_LL = FR_GL, FR_X_lo, f0 // ll = gl*x_lo
1601 fma.s1 FR_HL = FR_GG, FR_X_lo, f0 // hl = gg * x_lo
1607 fms.s1 FR_Res = FR_GL, f1, FR_LL // res = gl + ll
1613 fms.s1 FR_Res = FR_Res, f1, FR_LH // res = res + lh
1619 fms.s1 FR_Res = FR_Res, f1, FR_HL // res = res + hl
1625 fms.s1 FR_Res = FR_Res, f1, FR_HH // res = res + hh
1631 fma.s0 FR_Res = FR_Res, f1, FR_GG // result = res + gg
1632 br.ret.sptk b0 // Exit for near 1 path
1634 // NEAR ONE INTERVAL END
1642 fmerge.s FR_Arg_X = FR_Arg, FR_Arg
1646 mov GR_Parameter_TAG = 135
1647 frcpa.s0 FR_Res,p0 = f0,f0 // get QNaN,and raise invalid
1648 br.cond.sptk __libm_error_region // exit if x < 1.0
1651 GLOBAL_LIBM_END(acoshl)
1652 libm_alias_ldouble_other (acosh, acosh)
1656 LOCAL_LIBM_ENTRY(__libm_error_region)
1659 add GR_Parameter_Y = -32,sp // Parameter 2 value
1661 .save ar.pfs,GR_SAVE_PFS
1662 mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
1666 add sp = -64,sp // Create new stack
1668 mov GR_SAVE_GP = gp // Save gp
1672 stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Parameter 2 to stack
1673 add GR_Parameter_X = 16,sp // Parameter 1 address
1675 mov GR_SAVE_B0 = b0 // Save b0
1680 stfe [GR_Parameter_X] = FR_Arg_X // Parameter 1 to stack
1681 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1685 stfe [GR_Parameter_Y] = FR_Res // Parameter 3 to stack
1686 add GR_Parameter_Y = -16,GR_Parameter_Y
1687 br.call.sptk b0 = __libm_error_support# // Error handling function
1693 add GR_Parameter_RESULT = 48,sp
1697 ldfe f8 = [GR_Parameter_RESULT] // Get return res
1699 add sp = 64,sp // Restore stack pointer
1700 mov b0 = GR_SAVE_B0 // Restore return address
1704 mov gp = GR_SAVE_GP // Restore gp
1705 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1706 br.ret.sptk b0 // Return
1709 LOCAL_LIBM_END(__libm_error_region#)
1711 .type __libm_error_support#,@function
1712 .global __libm_error_support#