localedata: dz_BT, bo_CN: convert to UTF-8
[glibc.git] / sysdeps / ia64 / fpu / e_coshl.S
blob144ce1d6f3c3890f966f2ce0d7caf867f17f884b
1 .file "coshl.s"
4 // Copyright (c) 2000 - 2002, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 // History
40 //==============================================================
41 // 02/02/00 Initial version
42 // 04/04/00 Unwind support added
43 // 08/15/00 Bundle added after call to __libm_error_support to properly
44 //          set [the previously overwritten] GR_Parameter_RESULT.
45 // 01/23/01 Set inexact flag for large args.
46 // 05/07/01 Reworked to improve speed of all paths
47 // 05/20/02 Cleaned up namespace and sf0 syntax
48 // 12/06/02 Improved performance
50 // API
51 //==============================================================
52 // long double = coshl(long double)
53 // input  floating point f8
54 // output floating point f8
56 // Registers used
57 //==============================================================
58 // general registers:
59 // r14 -> r40
60 // predicate registers used:
61 // p6 -> p11
62 // floating-point registers used:
63 // f9 -> f15; f32 -> f90;
64 // f8 has input, then output
66 // Overview of operation
67 //==============================================================
68 // There are seven paths
69 // 1. 0 < |x| < 0.25          COSH_BY_POLY
70 // 2. 0.25 <=|x| < 32         COSH_BY_TBL
71 // 3. 32 <= |x| < 11357.21655 COSH_BY_EXP (merged path with COSH_BY_TBL)
72 // 4. |x| >= 11357.21655      COSH_HUGE
73 // 5. x=0                     Done with early exit
74 // 6. x=inf,nan               Done with early exit
75 // 7. x=denormal              COSH_DENORM
77 // For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea
78 //                                           >= 11357.21655
81 // 1. COSH_BY_POLY   0 < |x| < 0.25
82 // ===============
83 // Evaluate cosh(x) by a 12th order polynomial
84 // Care is take for the order of multiplication; and P2 is not exactly 1/4!,
85 // P3 is not exactly 1/6!, etc.
86 // cosh(x) = 1 + (P1*x^2 + P2*x^4 + P3*x^6 + P4*x^8 + P5*x^10 + P6*x^12)
88 // 2. COSH_BY_TBL   0.25 <= |x| < 32.0
89 // =============
90 // cosh(x) = cosh(B+R)
91 //         = cosh(B)cosh(R) + sinh(B)sinh(R)
93 // ax = |x| = M*log2/64 + R
94 // B = M*log2/64
95 // M = 64*N + j
96 //   We will calculate M and get N as (M-j)/64
97 //   The division is a shift.
98 // exp(B)  = exp(N*log2 + j*log2/64)
99 //         = 2^N * 2^(j*log2/64)
100 // cosh(B) = 1/2(e^B + e^-B)
101 //         = 1/2(2^N * 2^(j*log2/64) + 2^-N * 2^(-j*log2/64))
102 // cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
103 // sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
104 // 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
105 // Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
107 // R = ax - M*log2/64
108 // R = ax - M*log2_by_64_hi - M*log2_by_64_lo
109 // exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
110 //        = 1 + p_odd + p_even
111 //        where the p_even uses the A coefficients and the p_even uses
112 //        the B coefficients
114 // So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
115 //    cosh(R) = 1 + p_even
116 //    cosh(B) = C_hi + C_lo
117 //    sinh(B) = S_hi
118 // cosh(x) = cosh(B)cosh(R) + sinh(B)sinh(R)
120 // 3. COSH_BY_EXP   32.0 <= |x| < 11357.21655  ( 400c b174 ddc0 31ae c0ea )
121 // ==============
122 // Can approximate result by exp(x)/2 in this region.
123 // Y_hi = Tjhi
124 // Y_lo = Tjhi * (p_odd + p_even) + Tjlo
125 // cosh(x) = Y_hi + Y_lo
127 // 4. COSH_HUGE     |x| >= 11357.21655  ( 400c b174 ddc0 31ae c0ea )
128 // ============
129 // Set error tag and call error support
132 // Assembly macros
133 //==============================================================
134 r_ad5                 = r14
135 r_rshf_2to57          = r15
136 r_exp_denorm          = r15
137 r_ad_mJ_lo            = r15
138 r_ad_J_lo             = r16
139 r_2Nm1                = r17
140 r_2mNm1               = r18
141 r_exp_x               = r18
142 r_ad_J_hi             = r19
143 r_ad2o                = r19
144 r_ad_mJ_hi            = r20
145 r_mj                  = r21
146 r_ad2e                = r22
147 r_ad3                 = r23
148 r_ad1                 = r24
149 r_Mmj                 = r24
150 r_rshf                = r25
151 r_M                   = r25
152 r_N                   = r25
153 r_jshf                = r26
154 r_exp_2tom57          = r26
155 r_j                   = r26
156 r_exp_mask            = r27
157 r_signexp_x           = r28
158 r_signexp_0_5         = r28
159 r_exp_0_25            = r29
160 r_sig_inv_ln2         = r30
161 r_exp_32              = r30
162 r_exp_huge            = r30
163 r_ad4                 = r31
165 GR_SAVE_PFS           = r34
166 GR_SAVE_B0            = r35
167 GR_SAVE_GP            = r36
169 GR_Parameter_X        = r37
170 GR_Parameter_Y        = r38
171 GR_Parameter_RESULT   = r39
172 GR_Parameter_TAG      = r40
175 f_ABS_X               = f9
176 f_X2                  = f10
177 f_X4                  = f11
178 f_tmp                 = f14
179 f_RSHF                = f15
181 f_Inv_log2by64        = f32
182 f_log2by64_lo         = f33
183 f_log2by64_hi         = f34
184 f_A1                  = f35
186 f_A2                  = f36
187 f_A3                  = f37
188 f_Rcub                = f38
189 f_M_temp              = f39
190 f_R_temp              = f40
192 f_Rsq                 = f41
193 f_R                   = f42
194 f_M                   = f43
195 f_B1                  = f44
196 f_B2                  = f45
198 f_B3                  = f46
199 f_peven_temp1         = f47
200 f_peven_temp2         = f48
201 f_peven               = f49
202 f_podd_temp1          = f50
204 f_podd_temp2          = f51
205 f_podd                = f52
206 f_poly65              = f53
207 f_poly6543            = f53
208 f_poly6to1            = f53
209 f_poly43              = f54
210 f_poly21              = f55
212 f_X3                  = f56
213 f_INV_LN2_2TO63       = f57
214 f_RSHF_2TO57          = f58
215 f_2TOM57              = f59
216 f_smlst_oflow_input   = f60
218 f_pre_result          = f61
219 f_huge                = f62
220 f_spos                = f63
221 f_sneg                = f64
222 f_Tjhi                = f65
224 f_Tjlo                = f66
225 f_Tmjhi               = f67
226 f_Tmjlo               = f68
227 f_S_hi                = f69
228 f_SC_hi_temp          = f70
230 f_C_lo_temp1          = f71
231 f_C_lo_temp2          = f72
232 f_C_lo_temp3          = f73
233 f_C_lo_temp4          = f73
234 f_C_lo                = f74
235 f_C_hi                = f75
237 f_Y_hi                = f77
238 f_Y_lo_temp           = f78
239 f_Y_lo                = f79
240 f_NORM_X              = f80
242 f_P1                  = f81
243 f_P2                  = f82
244 f_P3                  = f83
245 f_P4                  = f84
246 f_P5                  = f85
248 f_P6                  = f86
249 f_Tjhi_spos           = f87
250 f_Tjlo_spos           = f88
251 f_huge                = f89
252 f_signed_hi_lo        = f90
255 // Data tables
256 //==============================================================
258 // DO NOT CHANGE ORDER OF THESE TABLES
259 RODATA
261 .align 16
262 LOCAL_OBJECT_START(cosh_arg_reduction)
263 //   data8 0xB8AA3B295C17F0BC, 0x00004005  // 64/log2 -- signif loaded with setf
264    data8 0xB17217F7D1000000, 0x00003FF8  // log2/64 high part
265    data8 0xCF79ABC9E3B39804, 0x00003FD0  // log2/64 low part
266    data8 0xb174ddc031aec0ea, 0x0000400c  // Smallest x to overflow (11357.21655)
267 LOCAL_OBJECT_END(cosh_arg_reduction)
269 LOCAL_OBJECT_START(cosh_p_table)
270    data8 0x8FA02AC65BCBD5BC, 0x00003FE2  // P6
271    data8 0xD00D00D1021D7370, 0x00003FEF  // P4
272    data8 0xAAAAAAAAAAAAAB80, 0x00003FFA  // P2
273    data8 0x93F27740C0C2F1CC, 0x00003FE9  // P5
274    data8 0xB60B60B60B4FE884, 0x00003FF5  // P3
275    data8 0x8000000000000000, 0x00003FFE  // P1
276 LOCAL_OBJECT_END(cosh_p_table)
278 LOCAL_OBJECT_START(cosh_ab_table)
279    data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC  // A1
280    data8 0x88888888884ECDD5, 0x00003FF8  // A2
281    data8 0xD00D0C6DCC26A86B, 0x00003FF2  // A3
282    data8 0x8000000000000002, 0x00003FFE  // B1
283    data8 0xAAAAAAAAAA402C77, 0x00003FFA  // B2
284    data8 0xB60B6CC96BDB144D, 0x00003FF5  // B3
285 LOCAL_OBJECT_END(cosh_ab_table)
287 LOCAL_OBJECT_START(cosh_j_hi_table)
288    data8 0xB504F333F9DE6484, 0x00003FFE
289    data8 0xB6FD91E328D17791, 0x00003FFE
290    data8 0xB8FBAF4762FB9EE9, 0x00003FFE
291    data8 0xBAFF5AB2133E45FB, 0x00003FFE
292    data8 0xBD08A39F580C36BF, 0x00003FFE
293    data8 0xBF1799B67A731083, 0x00003FFE
294    data8 0xC12C4CCA66709456, 0x00003FFE
295    data8 0xC346CCDA24976407, 0x00003FFE
296    data8 0xC5672A115506DADD, 0x00003FFE
297    data8 0xC78D74C8ABB9B15D, 0x00003FFE
298    data8 0xC9B9BD866E2F27A3, 0x00003FFE
299    data8 0xCBEC14FEF2727C5D, 0x00003FFE
300    data8 0xCE248C151F8480E4, 0x00003FFE
301    data8 0xD06333DAEF2B2595, 0x00003FFE
302    data8 0xD2A81D91F12AE45A, 0x00003FFE
303    data8 0xD4F35AABCFEDFA1F, 0x00003FFE
304    data8 0xD744FCCAD69D6AF4, 0x00003FFE
305    data8 0xD99D15C278AFD7B6, 0x00003FFE
306    data8 0xDBFBB797DAF23755, 0x00003FFE
307    data8 0xDE60F4825E0E9124, 0x00003FFE
308    data8 0xE0CCDEEC2A94E111, 0x00003FFE
309    data8 0xE33F8972BE8A5A51, 0x00003FFE
310    data8 0xE5B906E77C8348A8, 0x00003FFE
311    data8 0xE8396A503C4BDC68, 0x00003FFE
312    data8 0xEAC0C6E7DD24392F, 0x00003FFE
313    data8 0xED4F301ED9942B84, 0x00003FFE
314    data8 0xEFE4B99BDCDAF5CB, 0x00003FFE
315    data8 0xF281773C59FFB13A, 0x00003FFE
316    data8 0xF5257D152486CC2C, 0x00003FFE
317    data8 0xF7D0DF730AD13BB9, 0x00003FFE
318    data8 0xFA83B2DB722A033A, 0x00003FFE
319    data8 0xFD3E0C0CF486C175, 0x00003FFE
320    data8 0x8000000000000000, 0x00003FFF // Center of table
321    data8 0x8164D1F3BC030773, 0x00003FFF
322    data8 0x82CD8698AC2BA1D7, 0x00003FFF
323    data8 0x843A28C3ACDE4046, 0x00003FFF
324    data8 0x85AAC367CC487B15, 0x00003FFF
325    data8 0x871F61969E8D1010, 0x00003FFF
326    data8 0x88980E8092DA8527, 0x00003FFF
327    data8 0x8A14D575496EFD9A, 0x00003FFF
328    data8 0x8B95C1E3EA8BD6E7, 0x00003FFF
329    data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF
330    data8 0x8EA4398B45CD53C0, 0x00003FFF
331    data8 0x9031DC431466B1DC, 0x00003FFF
332    data8 0x91C3D373AB11C336, 0x00003FFF
333    data8 0x935A2B2F13E6E92C, 0x00003FFF
334    data8 0x94F4EFA8FEF70961, 0x00003FFF
335    data8 0x96942D3720185A00, 0x00003FFF
336    data8 0x9837F0518DB8A96F, 0x00003FFF
337    data8 0x99E0459320B7FA65, 0x00003FFF
338    data8 0x9B8D39B9D54E5539, 0x00003FFF
339    data8 0x9D3ED9A72CFFB751, 0x00003FFF
340    data8 0x9EF5326091A111AE, 0x00003FFF
341    data8 0xA0B0510FB9714FC2, 0x00003FFF
342    data8 0xA27043030C496819, 0x00003FFF
343    data8 0xA43515AE09E6809E, 0x00003FFF
344    data8 0xA5FED6A9B15138EA, 0x00003FFF
345    data8 0xA7CD93B4E965356A, 0x00003FFF
346    data8 0xA9A15AB4EA7C0EF8, 0x00003FFF
347    data8 0xAB7A39B5A93ED337, 0x00003FFF
348    data8 0xAD583EEA42A14AC6, 0x00003FFF
349    data8 0xAF3B78AD690A4375, 0x00003FFF
350    data8 0xB123F581D2AC2590, 0x00003FFF
351    data8 0xB311C412A9112489, 0x00003FFF
352    data8 0xB504F333F9DE6484, 0x00003FFF
353 LOCAL_OBJECT_END(cosh_j_hi_table)
355 LOCAL_OBJECT_START(cosh_j_lo_table)
356    data4 0x1EB2FB13
357    data4 0x1CE2CBE2
358    data4 0x1DDC3CBC
359    data4 0x1EE9AA34
360    data4 0x9EAEFDC1
361    data4 0x9DBF517B
362    data4 0x1EF88AFB
363    data4 0x1E03B216
364    data4 0x1E78AB43
365    data4 0x9E7B1747
366    data4 0x9EFE3C0E
367    data4 0x9D36F837
368    data4 0x9DEE53E4
369    data4 0x9E24AE8E
370    data4 0x1D912473
371    data4 0x1EB243BE
372    data4 0x1E669A2F
373    data4 0x9BBC610A
374    data4 0x1E761035
375    data4 0x9E0BE175
376    data4 0x1CCB12A1
377    data4 0x1D1BFE90
378    data4 0x1DF2F47A
379    data4 0x1EF22F22
380    data4 0x9E3F4A29
381    data4 0x1EC01A5B
382    data4 0x1E8CAC3A
383    data4 0x9DBB3FAB
384    data4 0x1EF73A19
385    data4 0x9BB795B5
386    data4 0x1EF84B76
387    data4 0x9EF5818B
388    data4 0x00000000 // Center of table
389    data4 0x1F77CACA
390    data4 0x1EF8A91D
391    data4 0x1E57C976
392    data4 0x9EE8DA92
393    data4 0x1EE85C9F
394    data4 0x1F3BF1AF
395    data4 0x1D80CA1E
396    data4 0x9D0373AF
397    data4 0x9F167097
398    data4 0x1EB70051
399    data4 0x1F6EB029
400    data4 0x1DFD6D8E
401    data4 0x9EB319B0
402    data4 0x1EBA2BEB
403    data4 0x1F11D537
404    data4 0x1F0D5A46
405    data4 0x9E5E7BCA
406    data4 0x9F3AAFD1
407    data4 0x9E86DACC
408    data4 0x9F3EDDC2
409    data4 0x1E496E3D
410    data4 0x9F490BF6
411    data4 0x1DD1DB48
412    data4 0x1E65EBFB
413    data4 0x9F427496
414    data4 0x1F283C4A
415    data4 0x1F4B0047
416    data4 0x1F130152
417    data4 0x9E8367C0
418    data4 0x9F705F90
419    data4 0x1EFB3C53
420    data4 0x1F32FB13
421 LOCAL_OBJECT_END(cosh_j_lo_table)
424 .section .text
425 GLOBAL_IEEE754_ENTRY(coshl)
427 { .mlx
428       getf.exp        r_signexp_x = f8   // Get signexp of x, must redo if unorm
429       movl            r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
431 { .mlx
432       addl            r_ad1 = @ltoff(cosh_arg_reduction), gp
433       movl            r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
437 { .mfi
438       ld8             r_ad1 = [r_ad1]
439       fmerge.s        f_ABS_X    = f0,f8
440       mov             r_exp_0_25 = 0x0fffd    // Form exponent for 0.25
442 { .mfi
443       nop.m           0
444       fnorm.s1        f_NORM_X = f8
445       mov             r_exp_2tom57 = 0xffff-57
449 { .mfi
450       setf.d          f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
451       fclass.m        p10,p0 = f8, 0x0b           // Test for denorm
452       mov             r_exp_mask = 0x1ffff
454 { .mlx
455       setf.sig        f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
456       movl            r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
460 { .mfi
461       nop.m           0
462       fclass.m        p7,p0 = f8, 0x07  // Test if x=0
463       nop.i           0
465 { .mfi
466       setf.exp        f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling
467       nop.f           0
468       add             r_ad3 = 0x90, r_ad1  // Point to ab_table
472 { .mfi
473       setf.d          f_RSHF = r_rshf     // Form right shift const 1.100 * 2^63
474       fclass.m        p6,p0 = f8, 0xe3     // Test if x nan, inf
475       add             r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint
477 { .mib
478       add             r_ad2e = 0x20, r_ad1 // Point to p_table
479       nop.i           0
480 (p10) br.cond.spnt    COSH_DENORM          // Branch if x denorm
484 // Common path -- return here from COSH_DENORM if x is unnorm
485 COSH_COMMON:
486 { .mfi
487       ldfe            f_smlst_oflow_input = [r_ad2e],16
488 (p7)  fma.s0          f8 = f1, f1, f0      // Result = 1.0 if x=0
489       add             r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
491 { .mib
492       ldfe            f_log2by64_hi  = [r_ad1],16
493       and             r_exp_x = r_exp_mask, r_signexp_x
494 (p7)  br.ret.spnt     b0                  // Exit if x=0
498 // Get the A coefficients for COSH_BY_TBL
499 { .mfi
500       ldfe            f_A1 = [r_ad3],16
501       fcmp.lt.s1      p8,p9 = f8,f0           // Test for x<0
502       cmp.lt          p7,p0 = r_exp_x, r_exp_0_25  // Test x < 0.25
504 { .mfb
505       add             r_ad2o = 0x30, r_ad2e  // Point to p_table odd coeffs
506 (p6)  fma.s0          f8 = f8,f8,f0          // Result for x nan, inf
507 (p6)  br.ret.spnt     b0                     // Exit for x nan, inf
511 // Calculate X2 = ax*ax for COSH_BY_POLY
512 { .mfi
513       ldfe            f_log2by64_lo  = [r_ad1],16
514       nop.f           0
515       nop.i           0
517 { .mfb
518       ldfe            f_A2 = [r_ad3],16
519       fma.s1          f_X2 = f_NORM_X, f_NORM_X, f0
520 (p7)  br.cond.spnt    COSH_BY_POLY
524 // Here if |x| >= 0.25
525 COSH_BY_TBL:
526 // ******************************************************
527 // STEP 1 (TBL and EXP) - Argument reduction
528 // ******************************************************
529 // Get the following constants.
530 // Inv_log2by64
531 // log2by64_hi
532 // log2by64_lo
535 // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
536 // put them in an exponent.
537 // f_spos = 2^(N-1) and f_sneg = 2^(-N-1)
538 // 0xffff + (N-1)  = 0xffff +N -1
539 // 0xffff - (N +1) = 0xffff -N -1
542 // Calculate M and keep it as integer and floating point.
543 // M = round-to-integer(x*Inv_log2by64)
544 // f_M = M = truncate(ax/(log2/64))
545 // Put the integer representation of M in r_M
546 //    and the floating point representation of M in f_M
548 // Get the remaining A,B coefficients
549 { .mmi
550       ldfe            f_A3 = [r_ad3],16
551       nop.m           0
552       nop.i           0
556 // Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand
557 // |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
558 { .mfi
559       nop.m           0
560       fma.s1          f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57
561       mov             r_signexp_0_5 = 0x0fffe // signexp of +0.5
565 // Test for |x| >= overflow limit
566 { .mfi
567       ldfe            f_B1 = [r_ad3],16
568       fcmp.ge.s1      p6,p0 = f_ABS_X, f_smlst_oflow_input
569       nop.i           0
573 { .mfi
574       ldfe            f_B2 = [r_ad3],16
575       nop.f           0
576       mov             r_exp_32 = 0x10004
580 // Subtract RSHF constant to get rounded M as a floating point value
581 // M_temp * 2^(63-6) - 2^63
582 { .mfb
583       ldfe            f_B3 = [r_ad3],16
584       fms.s1          f_M = f_M_temp, f_2TOM57, f_RSHF
585 (p6)  br.cond.spnt    COSH_HUGE  // Branch if result will overflow
589 { .mfi
590       getf.sig        r_M = f_M_temp
591       nop.f           0
592       cmp.ge          p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
596 // Calculate j. j is the signed extension of the six lsb of M. It
597 // has a range of -32 thru 31.
599 // Calculate R
600 // ax - M*log2by64_hi
601 // R = (ax - M*log2by64_hi) - M*log2by64_lo
603 { .mfi
604       nop.m           0
605       fnma.s1         f_R_temp = f_M, f_log2by64_hi, f_ABS_X
606       and             r_j = 0x3f, r_M
610 { .mii
611       nop.m           0
612       shl             r_jshf = r_j, 0x2 // Shift j so can sign extend it
614       sxt1            r_jshf = r_jshf
618 { .mii
619       nop.m           0
620       shr             r_j = r_jshf, 0x2    // Now j has range -32 to 31
621       nop.i           0
625 { .mmi
626       shladd          r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi
627       sub             r_Mmj = r_M, r_j          // M-j
628       sub             r_mj = r0, r_j            // Form -j
632 // The TBL and EXP branches are merged and predicated
633 // If TBL, p6 true, 0.25 <= |x| < 32
634 // If EXP, p7 true, 32 <= |x| < overflow_limit
636 // N = (M-j)/64
637 { .mfi
638       ldfe            f_Tjhi = [r_ad_J_hi]
639       fnma.s1         f_R = f_M, f_log2by64_lo, f_R_temp
640       shr             r_N = r_Mmj, 0x6            // N = (M-j)/64
642 { .mfi
643       shladd          r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
644       nop.f           0
645       shladd          r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo
649 { .mfi
650       sub             r_2mNm1 = r_signexp_0_5, r_N // signexp 2^(-N-1)
651       nop.f           0
652       shladd          r_ad_J_lo = r_j, 2, r_ad5   // pointer to Tjlo
654 { .mfi
655       ldfe            f_Tmjhi = [r_ad_mJ_hi]
656       nop.f           0
657       add             r_2Nm1 = r_signexp_0_5, r_N // signexp 2^(N-1)
661 { .mmf
662       ldfs            f_Tmjlo = [r_ad_mJ_lo]
663       setf.exp        f_sneg = r_2mNm1            // Form 2^(-N-1)
664       nop.f           0
668 { .mmf
669       ldfs            f_Tjlo  = [r_ad_J_lo]
670       setf.exp        f_spos = r_2Nm1             // Form 2^(N-1)
671       nop.f           0
675 // ******************************************************
676 // STEP 2 (TBL and EXP)
677 // ******************************************************
678 // Calculate Rsquared and Rcubed in preparation for p_even and p_odd
680 { .mmf
681       nop.m           0
682       nop.m           0
683       fma.s1          f_Rsq  = f_R, f_R, f0
688 // Calculate p_even
689 // B_2 + Rsq *B_3
690 // B_1 + Rsq * (B_2 + Rsq *B_3)
691 // p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
692 { .mfi
693       nop.m           0
694       fma.s1          f_peven_temp1 = f_Rsq, f_B3, f_B2
695       nop.i           0
697 // Calculate p_odd
698 // A_2 + Rsq *A_3
699 // A_1 + Rsq * (A_2 + Rsq *A_3)
700 // podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
701 { .mfi
702       nop.m           0
703       fma.s1          f_podd_temp1 = f_Rsq, f_A3, f_A2
704       nop.i           0
708 { .mfi
709       nop.m           0
710       fma.s1          f_Rcub = f_Rsq, f_R, f0
711       nop.i           0
716 // If TBL,
717 // Calculate S_hi and S_lo, and C_hi
718 // SC_hi_temp = sneg * Tmjhi
719 // S_hi = spos * Tjhi - SC_hi_temp
720 // S_hi = spos * Tjhi - (sneg * Tmjhi)
721 // C_hi = spos * Tjhi + SC_hi_temp
722 // C_hi = spos * Tjhi + (sneg * Tmjhi)
724 { .mfi
725       nop.m           0
726 (p6)  fma.s1          f_SC_hi_temp = f_sneg, f_Tmjhi, f0
727       nop.i           0
731 // If TBL,
732 // C_lo_temp3 = sneg * Tmjlo
733 // C_lo_temp4 = spos * Tjlo + C_lo_temp3
734 // C_lo_temp4 = spos * Tjlo + (sneg * Tmjlo)
735 { .mfi
736       nop.m           0
737 (p6)  fma.s1          f_C_lo_temp3 =  f_sneg, f_Tmjlo, f0
738       nop.i           0
742 { .mfi
743       nop.m           0
744       fma.s1          f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1
745       nop.i           0
747 { .mfi
748       nop.m           0
749       fma.s1          f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1
750       nop.i           0
754 // If EXP,
755 // Compute 2^(N-1) * Tjhi and 2^(N-1) * Tjlo
756 { .mfi
757       nop.m           0
758 (p7)  fma.s1          f_Tjhi_spos = f_Tjhi, f_spos, f0
759       nop.i           0
761 { .mfi
762       nop.m           0
763 (p7)  fma.s1          f_Tjlo_spos = f_Tjlo, f_spos, f0
764       nop.i           0
768 { .mfi
769       nop.m           0
770 (p6)  fma.s1          f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp
771       nop.i           0
775 { .mfi
776       nop.m           0
777 (p6)  fms.s1          f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp
778       nop.i           0
780 { .mfi
781       nop.m           0
782 (p6)  fma.s1          f_C_lo_temp4 = f_spos, f_Tjlo, f_C_lo_temp3
783       nop.i           0
787 { .mfi
788       nop.m           0
789       fma.s1          f_peven = f_Rsq, f_peven_temp2, f0
790       nop.i           0
792 { .mfi
793       nop.m           0
794       fma.s1          f_podd = f_podd_temp2, f_Rcub, f_R
795       nop.i           0
799 // If TBL,
800 // C_lo_temp1 =  spos * Tjhi - C_hi
801 // C_lo_temp2 =  sneg * Tmjlo + C_lo_temp1
802 // C_lo_temp2 =  sneg * Tmjlo + (spos * Tjhi - C_hi)
804 { .mfi
805       nop.m           0
806 (p6)  fms.s1          f_C_lo_temp1 =  f_spos, f_Tjhi,  f_C_hi
807       nop.i           0
811 { .mfi
812       nop.m           0
813 (p6)  fma.s1          f_C_lo_temp2 = f_sneg, f_Tmjhi, f_C_lo_temp1
814       nop.i           0
818 // If EXP,
819 // Y_hi = 2^(N-1) * Tjhi
820 // Y_lo = 2^(N-1) * Tjhi * (p_odd + p_even) + 2^(N-1) * Tjlo
821 { .mfi
822       nop.m           0
823 (p7)  fma.s1          f_Y_lo_temp =  f_peven, f1, f_podd
824       nop.i           0
828 // If TBL,
829 // C_lo = C_lo_temp4 + C_lo_temp2
830 { .mfi
831       nop.m           0
832 (p6)  fma.s1          f_C_lo = f_C_lo_temp4, f1, f_C_lo_temp2
833       nop.i           0
837 // If TBL,
838 // Y_hi = C_hi
839 // Y_lo = S_hi*p_odd + (C_hi*p_even + C_lo)
840 { .mfi
841       nop.m           0
842 (p6)  fma.s1          f_Y_lo_temp = f_C_hi, f_peven, f_C_lo
843       nop.i           0
847 { .mfi
848       nop.m           0
849 (p7)  fma.s1          f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos
850       nop.i           0
854 // Dummy multiply to generate inexact
855 { .mfi
856       nop.m           0
857       fmpy.s0         f_tmp = f_B2, f_B2
858       nop.i           0
860 { .mfi
861       nop.m           0
862 (p6)  fma.s1          f_Y_lo = f_S_hi, f_podd, f_Y_lo_temp
863       nop.i           0
867 // f8 = answer = Y_hi + Y_lo
868 { .mfi
869       nop.m           0
870 (p7)  fma.s0          f8 = f_Y_lo,  f1, f_Tjhi_spos
871       nop.i           0
875 // f8 = answer = Y_hi + Y_lo
876 { .mfb
877       nop.m           0
878 (p6)  fma.s0          f8 = f_Y_lo, f1, f_C_hi
879       br.ret.sptk     b0      // Exit for COSH_BY_TBL and COSH_BY_EXP
884 // Here if 0 < |x| < 0.25
885 COSH_BY_POLY:
886 { .mmf
887       ldfe            f_P6 = [r_ad2e],16
888       ldfe            f_P5 = [r_ad2o],16
889       nop.f           0
893 { .mmi
894       ldfe            f_P4 = [r_ad2e],16
895       ldfe            f_P3 = [r_ad2o],16
896       nop.i           0
900 { .mmi
901       ldfe            f_P2 = [r_ad2e],16
902       ldfe            f_P1 = [r_ad2o],16
903       nop.i           0
907 { .mfi
908       nop.m           0
909       fma.s1          f_X3 = f_NORM_X, f_X2, f0
910       nop.i           0
912 { .mfi
913       nop.m           0
914       fma.s1          f_X4 = f_X2, f_X2, f0
915       nop.i           0
919 { .mfi
920       nop.m           0
921       fma.s1          f_poly65 = f_X2, f_P6, f_P5
922       nop.i           0
924 { .mfi
925       nop.m           0
926       fma.s1          f_poly43 = f_X2, f_P4, f_P3
927       nop.i           0
931 { .mfi
932       nop.m           0
933       fma.s1          f_poly21 = f_X2, f_P2, f_P1
934       nop.i           0
938 { .mfi
939       nop.m           0
940       fma.s1          f_poly6543 = f_X4, f_poly65, f_poly43
941       nop.i           0
945 { .mfi
946       nop.m           0
947       fma.s1          f_poly6to1 = f_X4, f_poly6543, f_poly21
948       nop.i           0
952 // Dummy multiply to generate inexact
953 { .mfi
954       nop.m           0
955       fmpy.s0         f_tmp = f_P6, f_P6
956       nop.i           0
958 { .mfb
959       nop.m           0
960       fma.s0          f8 = f_poly6to1, f_X2, f1
961       br.ret.sptk     b0                // Exit COSH_BY_POLY
966 // Here if x denorm or unorm
967 COSH_DENORM:
968 // Determine if x really a denorm and not a unorm
969 { .mmf
970       getf.exp        r_signexp_x = f_NORM_X
971       mov             r_exp_denorm = 0x0c001   // Real denorms have exp < this
972       fmerge.s        f_ABS_X = f0, f_NORM_X
976 { .mfi
977       nop.m           0
978       fcmp.eq.s0      p10,p0 = f8, f0  // Set denorm flag
979       nop.i           0
983 // Set p8 if really a denorm
984 { .mmi
985       and             r_exp_x = r_exp_mask, r_signexp_x
987       cmp.lt          p8,p9 = r_exp_x, r_exp_denorm
988       nop.i           0
992 // Identify denormal operands.
993 { .mfb
994       nop.m           0
995 (p8)  fma.s0          f8 =  f8,f8,f1 // If x denorm, result=1+x^2
996 (p9)  br.cond.sptk    COSH_COMMON    // Return to main path if x unorm
1000 { .mfb
1001       nop.m           0
1002       nop.f           0
1003       br.ret.sptk     b0            // Exit if x denorm
1008 // Here if |x| >= overflow limit
1009 COSH_HUGE:
1010 // for COSH_HUGE, put 24000 in exponent; take sign from input
1011 { .mmi
1012       mov             r_exp_huge = 0x15dbf
1014       setf.exp        f_huge  = r_exp_huge
1015       nop.i           0
1019 { .mfi
1020       alloc           r32 = ar.pfs,0,5,4,0
1021       fma.s1          f_signed_hi_lo = f_huge, f1, f1
1022       nop.i           0
1026 { .mfi
1027       nop.m           0
1028       fma.s0          f_pre_result = f_signed_hi_lo, f_huge, f0
1029       mov             GR_Parameter_TAG = 63
1033 GLOBAL_IEEE754_END(coshl)
1034 libm_alias_ldouble_other (__cosh, cosh)
1037 LOCAL_LIBM_ENTRY(__libm_error_region)
1038 .prologue
1040 { .mfi
1041         add   GR_Parameter_Y=-32,sp              // Parameter 2 value
1042         nop.f 0
1043 .save   ar.pfs,GR_SAVE_PFS
1044         mov  GR_SAVE_PFS=ar.pfs                  // Save ar.pfs
1046 { .mfi
1047 .fframe 64
1048         add sp=-64,sp                            // Create new stack
1049         nop.f 0
1050         mov GR_SAVE_GP=gp                        // Save gp
1053 { .mmi
1054         stfe [GR_Parameter_Y] = f0,16            // STORE Parameter 2 on stack
1055         add GR_Parameter_X = 16,sp               // Parameter 1 address
1056 .save   b0, GR_SAVE_B0
1057         mov GR_SAVE_B0=b0                        // Save b0
1060 .body
1061 { .mib
1062         stfe [GR_Parameter_X] = f8               // STORE Parameter 1 on stack
1063         add   GR_Parameter_RESULT = 0,GR_Parameter_Y   // Parameter 3 address
1064         nop.b 0
1066 { .mib
1067         stfe [GR_Parameter_Y] = f_pre_result     // STORE Parameter 3 on stack
1068         add   GR_Parameter_Y = -16,GR_Parameter_Y
1069         br.call.sptk b0=__libm_error_support#    // Call error handling function
1072 { .mmi
1073         add   GR_Parameter_RESULT = 48,sp
1074         nop.m 0
1075         nop.i 0
1078 { .mmi
1079         ldfe  f8 = [GR_Parameter_RESULT]         // Get return result off stack
1080 .restore sp
1081         add   sp = 64,sp                         // Restore stack pointer
1082         mov   b0 = GR_SAVE_B0                    // Restore return address
1085 { .mib
1086         mov   gp = GR_SAVE_GP                    // Restore gp
1087         mov   ar.pfs = GR_SAVE_PFS               // Restore ar.pfs
1088         br.ret.sptk     b0                       // Return
1091 LOCAL_LIBM_END(__libm_error_region)
1094 .type   __libm_error_support#,@function
1095 .global __libm_error_support#