2.9
[glibc/nacl-glibc.git] / sysdeps / ia64 / fpu / e_atanhl.S
blobcee1ba17b1b6e7d475f45b1fabea96b702be1c94
1 .file "atanhl.s" 
4 // Copyright (c) 2001 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT 
26 // LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
29 // EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR 
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
32 // OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
34 // SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 // 
36 // Intel Corporation is the author of this code,and requests that all
37 // problem reports or change requests be submitted to it directly at 
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
40 //*********************************************************************
42 // History: 
43 // 09/10/01  Initial version
44 // 12/11/01  Corrected .restore syntax
45 // 05/20/02  Cleaned up namespace and sf0 syntax
46 // 02/10/03  Reordered header: .section, .global, .proc, .align;
47 //           used data8 for long double table values
49 //*********************************************************************
51 //*********************************************************************
53 // Function: atanhl(x) computes the principle value of the inverse 
54 // hyperbolic tangent of x.
56 //*********************************************************************
58 // Resources Used:
60 //    Floating-Point Registers: f8 (Input and Return Value)
61 //                              f33-f73
63 //    General Purpose Registers:
64 //      r32-r52
65 //      r49-r52 (Used to pass arguments to error handling routine)
67 //    Predicate Registers:      p6-p15
69 //*********************************************************************
71 // IEEE Special Conditions:
73 //    atanhl(inf) = QNaN
74 //    atanhl(-inf) = QNaN 
75 //    atanhl(+/-0) = +/-0 
76 //    atanhl(1) =  +inf 
77 //    atanhl(-1) =  -inf 
78 //    atanhl(|x|>1) = QNaN
79 //    atanhl(SNaN) = QNaN
80 //    atanhl(QNaN) = QNaN
82 //*********************************************************************
84 // Overview
86 // The method consists of two cases.
88 // If      |x| < 1/32  use case atanhl_near_zero;
89 // else                 use case atanhl_regular;
91 // Case atanhl_near_zero:
93 //   atanhl(x) can be approximated by the Taylor series expansion
94 //   up to order 17.
96 // Case atanhl_regular:
98 //   Here we use formula atanhl(x) = sign(x)*log1pl(2*|x|/(1-|x|))/2 and
99 //   calculation is subdivided into two stages. The first stage is 
100 //   calculating of X = 2*|x|/(1-|x|). The second one is calculating of 
101 //   sign(x)*log1pl(X)/2. To obtain required accuracy we use precise division
102 //   algorythm output of which is a pair of two extended precision values those
103 //   approximate result of division with accuracy higher than working
104 //   precision. This pair is passed to modified log1pl function.
107 //   1. calculating of X = 2*|x|/(1-|x|)
108 //   ( based on Peter Markstein's "IA-64 and Elementary Functions" book )
109 //   ********************************************************************
111 //     a = 2*|x|
112 //     b = 1 - |x|
113 //     b_lo = |x| - (1 - b)
115 //     y = frcpa(b)         initial approximation of 1/b
116 //     q = a*y              initial approximation of a/b
117 //     
118 //     e = 1 - b*y
119 //     e2 = e + e^2
120 //     e1 = e^2
121 //     y1 = y + y*e2 = y + y*(e+e^2)
123 //     e3 = e + e1^2
124 //     y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
126 //     r = a - b*q
127 //     e = 1 - b*y2
128 //     X = q + r*y2         high part of a/b
130 //     y3 = y2 + y2*e4
131 //     r1 = a - b*X
132 //     r1 = r1 - b_lo*X
133 //     X_lo = r1*y3         low part of a/b
134 //  
135 //   2. special log1p algorithm overview
136 //   ***********************************
138 //    Here we use a table lookup method. The basic idea is that in
139 //    order to compute logl(Arg) = log1pl (Arg-1) for an argument Arg in [1,2), 
140 //    we construct a value G such that G*Arg is close to 1 and that
141 //    logl(1/G) is obtainable easily from a table of values calculated
142 //    beforehand. Thus
144 //      logl(Arg) = logl(1/G) + logl(G*Arg)
145 //           = logl(1/G) + logl(1 + (G*Arg - 1))
147 //    Because |G*Arg - 1| is small, the second term on the right hand
148 //    side can be approximated by a short polynomial. We elaborate
149 //    this method in several steps.
151 //    Step 0: Initialization
152 //    ------
153 //    We need to calculate logl(X + X_lo + 1). Obtain N, S_hi such that
155 //      X + X_lo + 1 = 2^N * ( S_hi + S_lo )   exactly
157 //    where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
158 //    that |S_lo| <= ulp(S_hi).
160 //    For the special version of log1p we add X_lo to S_lo (S_lo = S_lo + X_lo)
161 //    !-----------------------------------------------------------------------!
163 //    Step 1: Argument Reduction
164 //    ------
165 //    Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
167 //      G := G_1 * G_2 * G_3
168 //      r := (G * S_hi - 1) + G * S_lo
170 //    These G_j's have the property that the product is exactly 
171 //    representable and that |r| < 2^(-12) as a result.
173 //    Step 2: Approximation
174 //    ------
175 //    logl(1 + r) is approximated by a short polynomial poly(r).
177 //    Step 3: Reconstruction
178 //    ------
179 //    Finally, log1pl(X + X_lo) = logl(X + X_lo + 1) is given by
181 //    logl(X + X_lo + 1) =  logl(2^N * (S_hi + S_lo))
182 //                      ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
183 //                      ~=~ N*logl(2) + logl(1/G) + poly(r).
185 //    For detailed description see log1p1 function, regular path.
187 //*********************************************************************
189 RODATA
190 .align 64
192 // ************* DO NOT CHANGE THE ORDER OF THESE TABLES *************
194 LOCAL_OBJECT_START(Constants_TaylorSeries)
195 data8  0xF0F0F0F0F0F0F0F1,0x00003FFA // C17
196 data8  0x8888888888888889,0x00003FFB // C15
197 data8  0x9D89D89D89D89D8A,0x00003FFB // C13
198 data8  0xBA2E8BA2E8BA2E8C,0x00003FFB // C11
199 data8  0xE38E38E38E38E38E,0x00003FFB // C9
200 data8  0x9249249249249249,0x00003FFC // C7
201 data8  0xCCCCCCCCCCCCCCCD,0x00003FFC // C5
202 data8  0xAAAAAAAAAAAAAAAA,0x00003FFD // C3
203 data4  0x3f000000                    // 1/2
204 data4  0x00000000                    // pad 
205 data4  0x00000000
206 data4  0x00000000
207 LOCAL_OBJECT_END(Constants_TaylorSeries)
209 LOCAL_OBJECT_START(Constants_Q)
210 data4  0x00000000,0xB1721800,0x00003FFE,0x00000000 // log2_hi
211 data4  0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000 // log2_lo
212 data4  0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000 // Q4
213 data4  0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000 // Q3
214 data4  0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000 // Q2
215 data4  0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000 // Q1
216 LOCAL_OBJECT_END(Constants_Q)
219 // Z1 - 16 bit fixed
220 LOCAL_OBJECT_START(Constants_Z_1)
221 data4  0x00008000
222 data4  0x00007879
223 data4  0x000071C8
224 data4  0x00006BCB
225 data4  0x00006667
226 data4  0x00006187
227 data4  0x00005D18
228 data4  0x0000590C
229 data4  0x00005556
230 data4  0x000051EC
231 data4  0x00004EC5
232 data4  0x00004BDB
233 data4  0x00004925
234 data4  0x0000469F
235 data4  0x00004445
236 data4  0x00004211
237 LOCAL_OBJECT_END(Constants_Z_1)
239 // G1 and H1 - IEEE single and h1 - IEEE double
240 LOCAL_OBJECT_START(Constants_G_H_h1)
241 data4  0x3F800000,0x00000000
242 data8  0x0000000000000000
243 data4  0x3F70F0F0,0x3D785196
244 data8  0x3DA163A6617D741C
245 data4  0x3F638E38,0x3DF13843
246 data8  0x3E2C55E6CBD3D5BB
247 data4  0x3F579430,0x3E2FF9A0
248 data8  0xBE3EB0BFD86EA5E7
249 data4  0x3F4CCCC8,0x3E647FD6
250 data8  0x3E2E6A8C86B12760
251 data4  0x3F430C30,0x3E8B3AE7
252 data8  0x3E47574C5C0739BA
253 data4  0x3F3A2E88,0x3EA30C68
254 data8  0x3E20E30F13E8AF2F
255 data4  0x3F321640,0x3EB9CEC8
256 data8  0xBE42885BF2C630BD
257 data4  0x3F2AAAA8,0x3ECF9927
258 data8  0x3E497F3497E577C6
259 data4  0x3F23D708,0x3EE47FC5
260 data8  0x3E3E6A6EA6B0A5AB
261 data4  0x3F1D89D8,0x3EF8947D
262 data8  0xBDF43E3CD328D9BE
263 data4  0x3F17B420,0x3F05F3A1
264 data8  0x3E4094C30ADB090A
265 data4  0x3F124920,0x3F0F4303
266 data8  0xBE28FBB2FC1FE510
267 data4  0x3F0D3DC8,0x3F183EBF
268 data8  0x3E3A789510FDE3FA
269 data4  0x3F088888,0x3F20EC80
270 data8  0x3E508CE57CC8C98F
271 data4  0x3F042108,0x3F29516A
272 data8  0xBE534874A223106C
273 LOCAL_OBJECT_END(Constants_G_H_h1)
275 // Z2 - 16 bit fixed
276 LOCAL_OBJECT_START(Constants_Z_2)
277 data4  0x00008000
278 data4  0x00007F81
279 data4  0x00007F02
280 data4  0x00007E85
281 data4  0x00007E08
282 data4  0x00007D8D
283 data4  0x00007D12
284 data4  0x00007C98
285 data4  0x00007C20
286 data4  0x00007BA8
287 data4  0x00007B31
288 data4  0x00007ABB
289 data4  0x00007A45
290 data4  0x000079D1
291 data4  0x0000795D
292 data4  0x000078EB
293 LOCAL_OBJECT_END(Constants_Z_2)
295 // G2 and H2 - IEEE single and h2 - IEEE double
296 LOCAL_OBJECT_START(Constants_G_H_h2)
297 data4  0x3F800000,0x00000000
298 data8  0x0000000000000000
299 data4  0x3F7F00F8,0x3B7F875D
300 data8  0x3DB5A11622C42273
301 data4  0x3F7E03F8,0x3BFF015B
302 data8  0x3DE620CF21F86ED3
303 data4  0x3F7D08E0,0x3C3EE393
304 data8  0xBDAFA07E484F34ED
305 data4  0x3F7C0FC0,0x3C7E0586
306 data8  0xBDFE07F03860BCF6
307 data4  0x3F7B1880,0x3C9E75D2
308 data8  0x3DEA370FA78093D6
309 data4  0x3F7A2328,0x3CBDC97A
310 data8  0x3DFF579172A753D0
311 data4  0x3F792FB0,0x3CDCFE47
312 data8  0x3DFEBE6CA7EF896B
313 data4  0x3F783E08,0x3CFC15D0
314 data8  0x3E0CF156409ECB43
315 data4  0x3F774E38,0x3D0D874D
316 data8  0xBE0B6F97FFEF71DF
317 data4  0x3F766038,0x3D1CF49B
318 data8  0xBE0804835D59EEE8
319 data4  0x3F757400,0x3D2C531D
320 data8  0x3E1F91E9A9192A74
321 data4  0x3F748988,0x3D3BA322
322 data8  0xBE139A06BF72A8CD
323 data4  0x3F73A0D0,0x3D4AE46F
324 data8  0x3E1D9202F8FBA6CF
325 data4  0x3F72B9D0,0x3D5A1756
326 data8  0xBE1DCCC4BA796223
327 data4  0x3F71D488,0x3D693B9D
328 data8  0xBE049391B6B7C239
329 LOCAL_OBJECT_END(Constants_G_H_h2)
331 // G3 and H3 - IEEE single and h3 - IEEE double 
332 LOCAL_OBJECT_START(Constants_G_H_h3)
333 data4  0x3F7FFC00,0x38800100
334 data8  0x3D355595562224CD
335 data4  0x3F7FF400,0x39400480
336 data8  0x3D8200A206136FF6
337 data4  0x3F7FEC00,0x39A00640
338 data8  0x3DA4D68DE8DE9AF0
339 data4  0x3F7FE400,0x39E00C41
340 data8  0xBD8B4291B10238DC
341 data4  0x3F7FDC00,0x3A100A21
342 data8  0xBD89CCB83B1952CA
343 data4  0x3F7FD400,0x3A300F22
344 data8  0xBDB107071DC46826
345 data4  0x3F7FCC08,0x3A4FF51C
346 data8  0x3DB6FCB9F43307DB
347 data4  0x3F7FC408,0x3A6FFC1D
348 data8  0xBD9B7C4762DC7872
349 data4  0x3F7FBC10,0x3A87F20B
350 data8  0xBDC3725E3F89154A
351 data4  0x3F7FB410,0x3A97F68B
352 data8  0xBD93519D62B9D392
353 data4  0x3F7FAC18,0x3AA7EB86
354 data8  0x3DC184410F21BD9D
355 data4  0x3F7FA420,0x3AB7E101
356 data8  0xBDA64B952245E0A6
357 data4  0x3F7F9C20,0x3AC7E701
358 data8  0x3DB4B0ECAABB34B8
359 data4  0x3F7F9428,0x3AD7DD7B
360 data8  0x3D9923376DC40A7E
361 data4  0x3F7F8C30,0x3AE7D474
362 data8  0x3DC6E17B4F2083D3
363 data4  0x3F7F8438,0x3AF7CBED
364 data8  0x3DAE314B811D4394
365 data4  0x3F7F7C40,0x3B03E1F3
366 data8  0xBDD46F21B08F2DB1
367 data4  0x3F7F7448,0x3B0BDE2F
368 data8  0xBDDC30A46D34522B
369 data4  0x3F7F6C50,0x3B13DAAA
370 data8  0x3DCB0070B1F473DB
371 data4  0x3F7F6458,0x3B1BD766
372 data8  0xBDD65DDC6AD282FD
373 data4  0x3F7F5C68,0x3B23CC5C
374 data8  0xBDCDAB83F153761A
375 data4  0x3F7F5470,0x3B2BC997
376 data8  0xBDDADA40341D0F8F
377 data4  0x3F7F4C78,0x3B33C711
378 data8  0x3DCD1BD7EBC394E8
379 data4  0x3F7F4488,0x3B3BBCC6
380 data8  0xBDC3532B52E3E695
381 data4  0x3F7F3C90,0x3B43BAC0
382 data8  0xBDA3961EE846B3DE
383 data4  0x3F7F34A0,0x3B4BB0F4
384 data8  0xBDDADF06785778D4
385 data4  0x3F7F2CA8,0x3B53AF6D
386 data8  0x3DCC3ED1E55CE212
387 data4  0x3F7F24B8,0x3B5BA620
388 data8  0xBDBA31039E382C15
389 data4  0x3F7F1CC8,0x3B639D12
390 data8  0x3D635A0B5C5AF197
391 data4  0x3F7F14D8,0x3B6B9444
392 data8  0xBDDCCB1971D34EFC
393 data4  0x3F7F0CE0,0x3B7393BC
394 data8  0x3DC7450252CD7ADA
395 data4  0x3F7F04F0,0x3B7B8B6D
396 data8  0xBDB68F177D7F2A42
397 LOCAL_OBJECT_END(Constants_G_H_h3)
401 // Floating Point Registers
403 FR_C17              = f50
404 FR_C15              = f51
405 FR_C13              = f52
406 FR_C11              = f53
407 FR_C9               = f54
408 FR_C7               = f55
409 FR_C5               = f56
410 FR_C3               = f57
411 FR_x2               = f58
412 FR_x3               = f59
413 FR_x4               = f60
414 FR_x8               = f61
416 FR_Rcp              = f61
418 FR_A                = f33
419 FR_R1               = f33
421 FR_E1               = f34
422 FR_E3               = f34
423 FR_Y2               = f34
424 FR_Y3               = f34
426 FR_E2               = f35
427 FR_Y1               = f35
429 FR_B                = f36
430 FR_Y0               = f37
431 FR_E0               = f38
432 FR_E4               = f39
433 FR_Q0               = f40
434 FR_R0               = f41
435 FR_B_lo             = f42
437 FR_abs_x            = f43
438 FR_Bp               = f44
439 FR_Bn               = f45
440 FR_Yp               = f46
441 FR_Yn               = f47
443 FR_X                = f48
444 FR_BB               = f48
445 FR_X_lo             = f49
447 FR_G                = f50
448 FR_Y_hi             = f51
449 FR_H                = f51
450 FR_h                = f52
451 FR_G2               = f53
452 FR_H2               = f54
453 FR_h2               = f55
454 FR_G3               = f56
455 FR_H3               = f57
456 FR_h3               = f58
458 FR_Q4               = f59
459 FR_poly_lo          = f59
460 FR_Y_lo             = f59
462 FR_Q3               = f60
463 FR_Q2               = f61
465 FR_Q1               = f62
466 FR_poly_hi          = f62
468 FR_float_N          = f63
470 FR_AA               = f64
471 FR_S_lo             = f64
473 FR_S_hi             = f65
474 FR_r                = f65
476 FR_log2_hi          = f66
477 FR_log2_lo          = f67
478 FR_Z                = f68
479 FR_2_to_minus_N     = f69
480 FR_rcub             = f70
481 FR_rsq              = f71
482 FR_05r              = f72
483 FR_Half             = f73
485 FR_Arg_X            = f50
486 FR_Arg_Y            = f0
487 FR_RESULT           = f8
491 // General Purpose Registers
493 GR_ad_05            = r33
494 GR_Index1           = r34
495 GR_ArgExp           = r34
496 GR_Index2           = r35
497 GR_ExpMask          = r35
498 GR_NearZeroBound    = r36
499 GR_signif           = r36
500 GR_X_0              = r37
501 GR_X_1              = r37
502 GR_X_2              = r38
503 GR_Index3           = r38
504 GR_minus_N          = r39
505 GR_Z_1              = r40
506 GR_Z_2              = r40
507 GR_N                = r41
508 GR_Bias             = r42
509 GR_M                = r43
510 GR_ad_taylor        = r44
511 GR_ad_taylor_2      = r45
512 GR_ad2_tbl_3        = r45
513 GR_ad_tbl_1         = r46
514 GR_ad_tbl_2         = r47
515 GR_ad_tbl_3         = r48
516 GR_ad_q             = r49
517 GR_ad_z_1           = r50
518 GR_ad_z_2           = r51
519 GR_ad_z_3           = r52
522 // Added for unwind support
524 GR_SAVE_PFS         = r46
525 GR_SAVE_B0          = r47
526 GR_SAVE_GP          = r48
527 GR_Parameter_X      = r49
528 GR_Parameter_Y      = r50
529 GR_Parameter_RESULT = r51
530 GR_Parameter_TAG    = r52
534 .section .text
535 GLOBAL_LIBM_ENTRY(atanhl)
537 { .mfi
538       alloc         r32 = ar.pfs,0,17,4,0
539       fnma.s1       FR_Bp = f8,f1,f1 // b = 1 - |arg| (for x>0)
540       mov           GR_ExpMask = 0x1ffff
541 }                    
542 { .mfi                
543       addl          GR_ad_taylor = @ltoff(Constants_TaylorSeries),gp
544       fma.s1        FR_Bn = f8,f1,f1 // b = 1 - |arg| (for x<0)
545       mov           GR_NearZeroBound = 0xfffa  // biased exp of 1/32
546 };;                    
547 { .mfi                
548       getf.exp      GR_ArgExp = f8
549       fcmp.lt.s1    p6,p7 = f8,f0 // is negative?
550       nop.i         0
551 }                    
552 { .mfi                
553       ld8           GR_ad_taylor = [GR_ad_taylor]
554       fmerge.s      FR_abs_x =  f1,f8
555       nop.i         0
556 };;                    
557 { .mfi                
558       nop.m         0
559       fclass.m      p8,p0 = f8,0x1C7 // is arg NaT,Q/SNaN or +/-0 ?
560       nop.i         0
562 { .mfi                
563       nop.m         0
564       fma.s1        FR_x2 = f8,f8,f0
565       nop.i         0
566 };;                    
567 { .mfi                
568       add           GR_ad_z_1 = 0x0F0,GR_ad_taylor
569       fclass.m      p9,p0 = f8,0x0a // is arg -denormal ?
570       add           GR_ad_taylor_2 = 0x010,GR_ad_taylor
571 }                    
572 { .mfi                
573       add           GR_ad_05 = 0x080,GR_ad_taylor
574       nop.f         0
575       nop.i         0
576 };;                    
577 { .mfi                
578       ldfe          FR_C17 = [GR_ad_taylor],32
579       fclass.m      p10,p0 = f8,0x09 // is arg +denormal ?
580       add           GR_ad_tbl_1 = 0x040,GR_ad_z_1 // point to Constants_G_H_h1
581 }                    
582 { .mfb                
583       add           GR_ad_z_2 = 0x140,GR_ad_z_1 // point to Constants_Z_2
584  (p8) fma.s0        f8 =  f8,f1,f0 // NaN or +/-0
585  (p8) br.ret.spnt   b0             // exit for Nan or +/-0
586 };;                    
587 { .mfi                
588       ldfe          FR_C15 = [GR_ad_taylor_2],32
589       fclass.m      p15,p0 = f8,0x23 // is +/-INF ?
590       add           GR_ad_tbl_2 = 0x180,GR_ad_z_1 // point to Constants_G_H_h2
591 }                    
592 { .mfb                
593       ldfe          FR_C13 = [GR_ad_taylor],32
594  (p9) fnma.s0       f8 =  f8,f8,f8 // -denormal
595  (p9) br.ret.spnt   b0             // exit for -denormal
596 };;                    
597 { .mfi                
598       ldfe          FR_C11 = [GR_ad_taylor_2],32
599       fcmp.eq.s0       p13,p0 = FR_abs_x,f1 // is |arg| = 1?
600       nop.i         0
601 }                    
602 { .mfb                
603       ldfe          FR_C9 = [GR_ad_taylor],32
604 (p10) fma.s0        f8 =  f8,f8,f8 // +denormal
605 (p10) br.ret.spnt   b0             // exit for +denormal
606 };;                    
607 { .mfi                
608       ldfe          FR_C7 = [GR_ad_taylor_2],32
609  (p6) frcpa.s1      FR_Yn,p11 = f1,FR_Bn // y = frcpa(b)
610       and           GR_ArgExp = GR_ArgExp,GR_ExpMask // biased exponent
611 }                    
612 { .mfb                
613       ldfe          FR_C5 = [GR_ad_taylor],32
614       fnma.s1       FR_B = FR_abs_x,f1,f1 // b = 1 - |arg|
615 (p15) br.cond.spnt  atanhl_gt_one // |arg| > 1
617 { .mfb
618       cmp.gt        p14,p0 = GR_NearZeroBound,GR_ArgExp
619  (p7) frcpa.s1      FR_Yp,p12 = f1,FR_Bp // y = frcpa(b)
620 (p13) br.cond.spnt  atanhl_eq_one // |arg| = 1/32
622 { .mfb
623       ldfe          FR_C3 = [GR_ad_taylor_2],32
624       fma.s1        FR_A = FR_abs_x,f1,FR_abs_x // a = 2 * |arg|
625 (p14) br.cond.spnt  atanhl_near_zero // |arg| < 1/32
627 { .mfi
628       nop.m         0
629       fcmp.gt.s0       p8,p0 = FR_abs_x,f1 // is |arg| > 1 ?
630       nop.i         0
632 .pred.rel "mutex",p6,p7
633 { .mfi
634       nop.m         0
635  (p6) fnma.s1       FR_B_lo = FR_Bn,f1,f1 // argt = 1 - (1 - |arg|)
636       nop.i         0
638 { .mfi
639       ldfs          FR_Half = [GR_ad_05]
640  (p7) fnma.s1       FR_B_lo = FR_Bp,f1,f1
641       nop.i         0
642 };;                    
643 { .mfi
644       nop.m         0
645  (p6) fnma.s1       FR_E0 = FR_Yn,FR_Bn,f1 // e = 1-b*y 
646       nop.i         0
647 }                    
648 { .mfb                
649       nop.m         0
650  (p6) fma.s1        FR_Y0 = FR_Yn,f1,f0
651  (p8) br.cond.spnt  atanhl_gt_one // |arg| > 1
653 { .mfi
654       nop.m         0
655  (p7) fnma.s1       FR_E0 = FR_Yp,FR_Bp,f1 
656       nop.i         0
658 { .mfi
659       nop.m         0
660  (p6) fma.s1        FR_Q0 = FR_A,FR_Yn,f0 // q = a*y
661       nop.i         0
663 { .mfi
664       nop.m         0
665  (p7) fma.s1        FR_Q0 = FR_A,FR_Yp,f0
666       nop.i         0
668 { .mfi
669       nop.m         0
670  (p7) fma.s1        FR_Y0 = FR_Yp,f1,f0
671       nop.i         0
673 { .mfi
674       nop.m         0
675       fclass.nm     p10,p0 = f8,0x1FF  // test for unsupported
676       nop.i         0
678 { .mfi
679       nop.m         0
680       fma.s1        FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2
681       nop.i         0
683 { .mfi
684       nop.m         0
685       fma.s1        FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2
686       nop.i         0
688 { .mfb
689       nop.m         0
690 //    Return generated NaN or other value for unsupported values.
691 (p10) fma.s0        f8 = f8, f0, f0
692 (p10) br.ret.spnt   b0
694 { .mfi
695       nop.m         0
696       fma.s1        FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2
697       nop.i         0
699 { .mfi
700       nop.m         0
701       fma.s1        FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2
702       nop.i         0
704 { .mfi
705       nop.m         0
706       fnma.s1       FR_B_lo = FR_abs_x,f1,FR_B_lo // b_lo = argt-|arg|
707       nop.i         0
709 { .mfi
710       nop.m         0
711       fma.s1        FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3
712       nop.i         0
714 { .mfi
715       nop.m         0
716       fnma.s1       FR_R0 = FR_B,FR_Q0,FR_A // r = a-b*q
717       nop.i         0
719 { .mfi
720       nop.m         0
721       fnma.s1       FR_E4 = FR_B,FR_Y2,f1 // e4 = 1-b*y2
722       nop.i         0
724 { .mfi
725       nop.m         0
726       fma.s1        FR_X = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2
727       nop.i         0
729 { .mfi
730       nop.m         0
731       fma.s1        FR_Z = FR_X,f1,f1 // x+1
732       nop.i         0
734 { .mfi
735       nop.m         0
736  (p6) fnma.s1       FR_Half = FR_Half,f1,f0 // sign(arg)/2
737       nop.i         0
739 { .mfi
740       nop.m         0
741       fma.s1        FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4
742       nop.i         0
744 { .mfi
745       nop.m         0
746       fnma.s1       FR_R1 = FR_B,FR_X,FR_A // r1 = a-b*x
747       nop.i         0
749 { .mfi
750       getf.sig      GR_signif = FR_Z // get significand of x+1
751       nop.f         0
752       nop.i         0
756 { .mfi
757       add           GR_ad_q = -0x060,GR_ad_z_1
758       nop.f         0
759       extr.u        GR_Index1 = GR_signif,59,4 // get high 4 bits of signif
761 { .mfi
762       add           GR_ad_tbl_3 = 0x280,GR_ad_z_1 // point to Constants_G_H_h3
763       nop.f         0
764       nop.i         0
766 { .mfi
767       shladd        GR_ad_z_1 = GR_Index1,2,GR_ad_z_1 // point to Z_1
768       nop.f         0
769       extr.u        GR_X_0 = GR_signif,49,15 // get high 15 bits of significand
771 { .mfi
772       ld4           GR_Z_1 = [GR_ad_z_1] // load Z_1
773       fmax.s1       FR_AA = FR_X,f1 // for S_lo,form AA = max(X,1.0)
774       nop.i         0
776 { .mfi
777       shladd        GR_ad_tbl_1 = GR_Index1,4,GR_ad_tbl_1 // point to G_1
778       nop.f         0
779       mov           GR_Bias = 0x0FFFF // exponent bias
781 { .mfi
782       ldfps         FR_G,FR_H = [GR_ad_tbl_1],8  // load G_1,H_1
783       fmerge.se     FR_S_hi =  f1,FR_Z // form |x+1|
784       nop.i         0
786 { .mfi
787       getf.exp      GR_N =  FR_Z // get N = exponent of x+1
788       nop.f         0
789       nop.i         0
791 { .mfi
792       ldfd          FR_h = [GR_ad_tbl_1] // load h_1
793       fnma.s1       FR_R1 = FR_B_lo,FR_X,FR_R1 // r1 = r1-b_lo*x
794       nop.i         0
796 { .mfi
797       ldfe          FR_log2_hi = [GR_ad_q],16 // load log2_hi
798       nop.f         0
799       pmpyshr2.u    GR_X_1 = GR_X_0,GR_Z_1,15 // get bits 30-15 of X_0 * Z_1
802 //    For performance,don't use result of pmpyshr2.u for 4 cycles.
804 { .mfi
805       ldfe          FR_log2_lo = [GR_ad_q],16 // load log2_lo
806       nop.f         0
807       sub           GR_N = GR_N,GR_Bias 
809 { .mfi
810       ldfe          FR_Q4 = [GR_ad_q],16  // load Q4
811       fms.s1        FR_S_lo = FR_AA,f1,FR_Z // form S_lo = AA - Z 
812       sub           GR_minus_N = GR_Bias,GR_N // form exponent of 2^(-N)
814 { .mmf
815       ldfe          FR_Q3 = [GR_ad_q],16 // load Q3
816       // put integer N into rightmost significand
817       setf.sig      FR_float_N = GR_N
818       fmin.s1       FR_BB = FR_X,f1 // for S_lo,form BB = min(X,1.0)
820 { .mfi
821       ldfe          FR_Q2 = [GR_ad_q],16 // load Q2
822       nop.f         0
823       extr.u        GR_Index2 = GR_X_1,6,4 // extract bits 6-9 of X_1 
825 { .mmi
826       ldfe          FR_Q1 = [GR_ad_q] // load Q1
827       shladd        GR_ad_z_2 = GR_Index2,2,GR_ad_z_2 // point to Z_2
828       nop.i         0
830 { .mmi
831       ld4           GR_Z_2 = [GR_ad_z_2] // load Z_2
832       shladd        GR_ad_tbl_2 = GR_Index2,4,GR_ad_tbl_2 // point to G_2
833       nop.i         0
835 { .mfi
836       ldfps         FR_G2,FR_H2 = [GR_ad_tbl_2],8 // load G_2,H_2
837       nop.f         0
838       nop.i         0
840 { .mfi
841       ldfd          FR_h2 = [GR_ad_tbl_2] // load h_2
842       fma.s1        FR_S_lo = FR_S_lo,f1,FR_BB // S_lo = S_lo + BB
843       nop.i         0
845 { .mfi
846       setf.exp      FR_2_to_minus_N = GR_minus_N // form 2^(-N)
847       fma.s1        FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3
848       nop.i         0
850 { .mfi
851       nop.m         0
852       nop.f         0
853       pmpyshr2.u    GR_X_2 = GR_X_1,GR_Z_2,15 // get bits 30-15 of X_1 * Z_2
856 //    For performance,don't use result of pmpyshr2.u for 4 cycles
858 { .mfi
859       add           GR_ad2_tbl_3 = 8,GR_ad_tbl_3
860       nop.f         0
861       nop.i         0
863 { .mfi
864       nop.m         0
865       nop.f         0 
866       nop.i         0
868 { .mfi
869       nop.m         0
870       nop.f         0 
871       nop.i         0
873 { .mfi
874       nop.m         0
875       nop.f         0 
876       nop.i         0
880 //    Now GR_X_2 can be used
882 { .mfi
883       nop.m         0
884       nop.f         0
885       extr.u        GR_Index3 = GR_X_2,1,5 // extract bits 1-5 of X_2
887 { .mfi
888       nop.m         0
889       fma.s1        FR_S_lo = FR_S_lo,f1,FR_X_lo // S_lo = S_lo + Arg_lo
890       nop.i         0
893 { .mfi
894       shladd        GR_ad_tbl_3 = GR_Index3,4,GR_ad_tbl_3 // point to G_3
895       fcvt.xf       FR_float_N = FR_float_N
896       nop.i         0
898 { .mfi
899       shladd        GR_ad2_tbl_3 = GR_Index3,4,GR_ad2_tbl_3 // point to h_3
900       fma.s1        FR_Q1 = FR_Q1,FR_Half,f0 // sign(arg)*Q1/2
901       nop.i         0
903 { .mmi
904       ldfps         FR_G3,FR_H3 = [GR_ad_tbl_3],8 // load G_3,H_3
905       ldfd          FR_h3 = [GR_ad2_tbl_3] // load h_3
906       nop.i         0
908 { .mfi
909       nop.m         0
910       fmpy.s1       FR_G = FR_G,FR_G2 // G = G_1 * G_2
911       nop.i         0
913 { .mfi
914       nop.m         0
915       fadd.s1       FR_H = FR_H,FR_H2 // H = H_1 + H_2
916       nop.i         0
918 { .mfi
919       nop.m         0
920       fadd.s1       FR_h = FR_h,FR_h2 // h = h_1 + h_2
921       nop.i         0
923 { .mfi
924       nop.m         0
925       // S_lo = S_lo * 2^(-N)
926       fma.s1        FR_S_lo = FR_S_lo,FR_2_to_minus_N,f0
927       nop.i         0
929 { .mfi
930       nop.m         0
931       fmpy.s1       FR_G = FR_G,FR_G3 // G = (G_1 * G_2) * G_3
932       nop.i         0
934 { .mfi
935       nop.m         0
936       fadd.s1       FR_H = FR_H,FR_H3 // H = (H_1 + H_2) + H_3
937       nop.i         0
939 { .mfi
940       nop.m         0
941       fadd.s1       FR_h = FR_h,FR_h3 // h = (h_1 + h_2) + h_3
942       nop.i         0
944 { .mfi
945       nop.m         0
946       fms.s1        FR_r = FR_G,FR_S_hi,f1 // r = G * S_hi - 1
947       nop.i         0
949 { .mfi
950       nop.m         0
951       // Y_hi = N * log2_hi + H
952       fma.s1        FR_Y_hi = FR_float_N,FR_log2_hi,FR_H
953       nop.i         0
955 { .mfi
956       nop.m         0
957       fma.s1        FR_h = FR_float_N,FR_log2_lo,FR_h // h = N * log2_lo + h
958       nop.i         0
960 { .mfi
961       nop.m         0
962       fma.s1        FR_r = FR_G,FR_S_lo,FR_r // r = G * S_lo + (G * S_hi - 1)
963       nop.i         0
965 { .mfi
966       nop.m         0
967       fma.s1        FR_poly_lo = FR_r,FR_Q4,FR_Q3 // poly_lo = r * Q4 + Q3
968       nop.i         0
970 { .mfi
971       nop.m         0
972       fmpy.s1       FR_rsq = FR_r,FR_r // rsq = r * r
973       nop.i         0
975 { .mfi
976       nop.m         0
977       fma.s1        FR_05r = FR_r,FR_Half,f0 // sign(arg)*r/2
978       nop.i         0
980 { .mfi
981       nop.m         0
982       // poly_lo = poly_lo * r + Q2
983       fma.s1        FR_poly_lo = FR_poly_lo,FR_r,FR_Q2
984       nop.i         0
986 { .mfi
987       nop.m         0
988       fma.s1        FR_rcub = FR_rsq,FR_r,f0 // rcub = r^3
989       nop.i         0
991 { .mfi
992       nop.m         0
993       // poly_hi = sing(arg)*(Q1*r^2 + r)/2
994       fma.s1        FR_poly_hi = FR_Q1,FR_rsq,FR_05r
995       nop.i         0
997 { .mfi
998       nop.m         0
999       // poly_lo = poly_lo*r^3 + h
1000       fma.s1        FR_poly_lo = FR_poly_lo,FR_rcub,FR_h
1001       nop.i         0
1003 { .mfi
1004       nop.m         0
1005       // Y_lo = poly_hi + poly_lo/2
1006       fma.s0        FR_Y_lo = FR_poly_lo,FR_Half,FR_poly_hi
1007       nop.i         0
1009 { .mfb
1010       nop.m         0
1011      // Result = arctanh(x) = Y_hi/2 + Y_lo
1012       fma.s0        f8 = FR_Y_hi,FR_Half,FR_Y_lo
1013       br.ret.sptk   b0
1016 // Taylor's series
1017 atanhl_near_zero:
1018 { .mfi
1019       nop.m         0
1020       fma.s1        FR_x3 = FR_x2,f8,f0
1021       nop.i         0
1023 { .mfi
1024       nop.m         0
1025       fma.s1        FR_x4 = FR_x2,FR_x2,f0
1026       nop.i         0
1028 { .mfi
1029       nop.m         0
1030       fma.s1        FR_C17 = FR_C17,FR_x2,FR_C15
1031       nop.i         0
1033 { .mfi
1034       nop.m         0
1035       fma.s1        FR_C13 = FR_C13,FR_x2,FR_C11
1036       nop.i         0
1038 { .mfi
1039       nop.m         0
1040       fma.s1        FR_C9 = FR_C9,FR_x2,FR_C7
1041       nop.i         0
1043 { .mfi
1044       nop.m         0
1045       fma.s1        FR_C5 = FR_C5,FR_x2,FR_C3
1046       nop.i         0
1048 { .mfi
1049       nop.m         0
1050       fma.s1        FR_x8 = FR_x4,FR_x4,f0
1051       nop.i         0
1053 { .mfi
1054       nop.m         0
1055       fma.s1        FR_C17 = FR_C17,FR_x4,FR_C13
1056       nop.i         0
1058 { .mfi
1059       nop.m         0
1060       fma.s1        FR_C9 = FR_C9,FR_x4,FR_C5
1061       nop.i         0
1063 { .mfi
1064       nop.m         0
1065       fma.s1        FR_C17 = FR_C17,FR_x8,FR_C9
1066       nop.i         0
1068 { .mfb
1069       nop.m         0
1070       fma.s0        f8 = FR_C17,FR_x3,f8
1071       br.ret.sptk   b0 
1074 atanhl_eq_one:
1075 { .mfi
1076       nop.m         0
1077       frcpa.s0      FR_Rcp,p0 = f1,f0 // get inf,and raise Z flag
1078       nop.i         0
1080 { .mfi
1081       nop.m         0
1082       fmerge.s      FR_Arg_X = f8, f8
1083       nop.i         0
1085 { .mfb
1086       mov           GR_Parameter_TAG = 130
1087       fmerge.s      FR_RESULT = f8,FR_Rcp // result is +-inf
1088       br.cond.sptk  __libm_error_region // exit if |x| = 1.0
1091 atanhl_gt_one:
1092 { .mfi
1093       nop.m         0
1094       fmerge.s      FR_Arg_X = f8, f8
1095       nop.i         0
1097 { .mfb
1098       mov           GR_Parameter_TAG = 129
1099       frcpa.s0      FR_RESULT,p0 = f0,f0 // get QNaN,and raise invalid
1100       br.cond.sptk  __libm_error_region // exit if |x| > 1.0
1103 GLOBAL_LIBM_END(atanhl)
1105 LOCAL_LIBM_ENTRY(__libm_error_region)
1106 .prologue
1107 { .mfi
1108         add   GR_Parameter_Y=-32,sp             // Parameter 2 value
1109         nop.f 0
1110 .save   ar.pfs,GR_SAVE_PFS
1111         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
1113 { .mfi
1114 .fframe 64
1115         add sp=-64,sp                           // Create new stack
1116         nop.f 0
1117         mov GR_SAVE_GP=gp                       // Save gp
1119 { .mmi
1120         stfe [GR_Parameter_Y] = FR_Arg_Y,16     // Save Parameter 2 on stack
1121         add GR_Parameter_X = 16,sp              // Parameter 1 address
1122 .save   b0,GR_SAVE_B0
1123         mov GR_SAVE_B0=b0                       // Save b0
1125 .body
1126 { .mib
1127         stfe [GR_Parameter_X] = FR_Arg_X        // Store Parameter 1 on stack
1128         add   GR_Parameter_RESULT = 0,GR_Parameter_Y
1129         nop.b 0                                 // Parameter 3 address
1131 { .mib
1132         stfe [GR_Parameter_Y] = FR_RESULT       // Store Parameter 3 on stack
1133         add   GR_Parameter_Y = -16,GR_Parameter_Y
1134         br.call.sptk b0=__libm_error_support#  // Call error handling function
1136 { .mmi
1137         nop.m 0
1138         nop.m 0
1139         add   GR_Parameter_RESULT = 48,sp
1141 { .mmi
1142         ldfe  f8 = [GR_Parameter_RESULT]       // Get return result off stack
1143 .restore sp
1144         add   sp = 64,sp                       // Restore stack pointer
1145         mov   b0 = GR_SAVE_B0                  // Restore return address
1147 { .mib
1148         mov   gp = GR_SAVE_GP                  // Restore gp
1149         mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
1150         br.ret.sptk     b0                     // Return
1153 LOCAL_LIBM_END(__libm_error_region#)
1155 .type   __libm_error_support#,@function
1156 .global __libm_error_support#