Remove support in configure for unsupported architectures
[glibc.git] / sysdeps / ia64 / fpu / s_erf.S
blob7174a197fbb5bf4ce989a6af99bda249ed0abae6
1 .file "erf.s"
4 // Copyright (c) 2001 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 // 
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at 
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
40 // History
41 //==============================================================
42 // 08/15/01 Initial version
43 // 05/20/02 Cleaned up namespace and sf0 syntax
44 // 02/06/03 Reordered header: .section, .global, .proc, .align
45 // 03/31/05 Reformatted delimiters between data tables
47 // API
48 //==============================================================
49 // double erf(double)
51 // Overview of operation
52 //==============================================================
53 // Background
56 // There are 9 paths:
57 // 1. x = +/-0.0
58 //    Return erf(x) = +/-0.0
60 // 2. 0.0 < |x| < 0.5
61 //    Return erf(x) = x *Pol9(x^2)
63 // 3. For several subranges of 0.5 <= |x| < 5.90625
64 //    Return erf(x) = sign(x)*Pol19(y), 
65 //    where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19
67 //    For each subrange there is a particular set of coefficients.
68 //    Below is the list of subranges:
69 //    3.1 0.5 <= |x| < 1.0     b = a = 0.5
70 //    3.2 1.0 <= |x| < 2.0,    b = a = 1.0
71 //    3.3 2.0 <= |x| < 3.25    b = a = 2.0
72 //    3.4 4.0 <= |x| < 5.90625 b = a = 4.0
74 // 4. 3.25 <= |x| < 4.0
75 //    Return erf(x) = sign(x)*Pol14(|x| - 3.25)
77 // 5. 5.90625 <= |x| < +INF
78 //    Return erf(x) = sign(x)*(1.0d - 2^(-63))
80 // 6. |x| = INF
81 //    Return erf(x) = sign(x) * 1.0
83 // 7. x = [S,Q]NaN 
84 //    Return erf(x) = QNaN
86 // 8. x is positive denormal
87 //    Return erf(x) = A0*x - x^2,
88 //    where A0 = 2.0/sqrt(Pi)
90 // 9. x is negative denormal
91 //    Return erf(x) = A0*x + x^2,
92 //    where A0 = 2.0/sqrt(Pi)
94 // Registers used
95 //==============================================================
96 // Floating Point registers used: 
97 // f8, input, output
98 // f32 -> f63
100 // General registers used:  
101 // r32 -> r48, r2, r3
103 // Predicate registers used:
104 // p0, p6 -> p15
106 // p6           to filter out case when x = denormal
107 // p7           to filter out case when x = [Q,S]NaN or +/-0,
108 //              used also to process denormals
109 // p8           to filter out case when 3.25 <= |x| < 4.0, 
110 //              used also to process denormals
111 // p9           to filter out case when |x| = inf
112 // p10          to filter out case when |x| < 0.5
113 // p11          set when |x| < 3.25 or |x| >= 4.0
114 // p12          to filter out case when |x| >= 5.90625
115 // p13          set if 4.0 <=|x| < 5.90625
116 // p14          set to 1 for positive x
117 // p15          set to 1 for negative x
119 // Assembly macros
120 //==============================================================
// General-register aliases.  The role noted for each one is how the code
// in this file uses it.
121 rDataPtr           = r2   // address of erf_data (loaded through @ltoff)
122 rDataPtr1          = r3   // not referenced by the code in this file
124 rBias              = r33  // 0x3FE00: exponent field of 0.5, shifted left by 8
125 rCoeffAddr3        = r34  // pointer used to load A3 and A1 (path #3)
126 rThreeAndQ         = r35  // bit pattern of the double 3.25
127 rCoeffAddr2        = r36  // rCoeffAddr1 + 16, second load stream
128 rMask              = r37  // 0x7FF00: selects the exponent field of (x >> 44)
129 rArg               = r38  // raw bits of x
130 rSignBit           = r39  // 1 << 63
131 rAbsArg            = r40  // raw bits of |x|
132 rSaturation        = r41  // 0x4017A: bits of 5.90625 shifted right by 44
133 rIndex             = r42  // byte offset of the coefficient set for the subrange
134 rCoeffAddr1        = r43  // first load stream for polynomial coefficients
135 rCoeffAddr4        = r44  // rCoeffAddr3 + 16, used to load A2 and A0 (path #3)
136 rShiftedArg        = r45  // rArg >> 44 (sign, exponent, top 8 mantissa bits)
137 rShiftedArgMasked  = r46  // rShiftedArg with sign and mantissa bits cleared
138 rBiasedExpOf4      = r47  // 0x40100: bits of 4.0 shifted right by 44
139 rShiftedAbsArg     = r48  // rAbsArg >> 44
141 //==============================================================
// Floating-point register aliases.  fA0..fA19 first receive polynomial
// coefficients and are then overwritten with partial sums.
142 fA0                = f32
143 fA1                = f33
144 fA2                = f34
145 fA3                = f35
146 fA4                = f36
147 fA5                = f37
148 fA6                = f38
149 fA7                = f39
150 fA8                = f40
151 fA9                = f41
152 fA10               = f42
153 fA11               = f43
154 fA12               = f44
155 fA13               = f45
156 fA14               = f46
157 fA15               = f47
158 fA16               = f48
159 fA17               = f49
160 fA18               = f50
161 fA19               = f51
162 fArgSqr            = f52  // x^2
163 fArgAbsNorm        = f53  // significand of x scaled to [1,2), then minus 1.0 (= y)
164 fSignumX           = f54  // sign of x attached to 1.0
165 fRes               = f55  // running polynomial accumulator
166 fThreeAndQ         = f56  // 3.25
167 fArgAbs            = f57  // |x|, later |x| - 3.25
168 fTSqr              = f58  // square of the polynomial argument (x^4 when |x| < 0.5)
169 fTQuadr            = f59  // fTSqr^2
170 fTDeg3             = f60  // y^3 (path #3)
171 fTDeg7             = f61  // y^7 in path #3; fTQuadr*fTSqr in the 3.25..4.0 path
172 fArgAbsNormSgn     = f62                          // polynomial argument times sign(x)
173 fTQuadrSgn         = f63  // fTQuadr * x (|x| < 0.5 path)
175 // Data tables
176 //==============================================================
// Read-only coefficient table for erf.
// Layout notes (byte offsets are relative to erf_data; the code addresses the
// table with hard-coded displacements, so entry order and size must not change):
//   - entries written as two data8 words occupy 16 bytes and are read with ldfe
//   - entries written as a single data8 are 8-byte values read in pairs with ldfpd
//   offset    0 : A19..A4 for 0.5 <= |x| < 1.0    (16 entries, 256 bytes)
//   offset  256 : A19..A4 for 1.0 <= |x| < 2.0
//   offset  512 : A19..A4 for 2.0 <= |x| < 3.25
//   offset  768 : A19..A4 for 4.0 <= |x| < 6.0
//   offset 1024 : A3..A0 for the same four subranges (64 bytes each)
//   offset 1280 : coefficients for |x| < 0.5 (second load stream starts at 1328)
//   offset 1376 : 1.0 - 2^(-63)
//   offset 1392 : A14..A0 for 3.25 <= |x| < 4.0
//   offset 1632 : 2.0/sqrt(Pi)
177 RODATA
179 .align 64
181 LOCAL_OBJECT_START(erf_data)
182 // Coefficients ##0..15
183 // Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 
184 data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19
185 data8 0x90AD48C0118FA10C, 0x00003FD7 //A18
186 data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17
187 data8 0x8DAB171246CC2B89, 0x00003FDC //A16
188 data8 0xC0B1D6662F8A7564, 0x00003FDF //A15
189 data8 0xA46374AC35099BAF, 0x0000BFE1 //A14
190 data8 0xB2F230996346EF27, 0x0000BFE4 //A13
191 data8 0xCDEC50950FACE04A, 0x00003FE6 //A12
192 data8 0x826014649396E9D2, 0x00003FE9 //A11
193 data8 0xCDB787DC718B13F9, 0x0000BFEB //A10
194 data8 0x8E0B23C24EE0C8EE, 0x0000BFED //A9
195 data8 0xA49EA40A4E5A3F76, 0x00003FF0 //A8
196 data8 0xB11E30BE912617D3, 0x00003FF0 //A7
197 data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6
198 data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5
199 data8 0xBB793EF404C09A22, 0x00003FF8 //A4
200 // Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 
201 data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19
202 data8 0x8A0FD46092F95D44, 0x0000BFEA //A18
203 data8 0xA37B3242B7809E12, 0x00003FEC //A17
204 data8 0xA0330A5CD2E91689, 0x0000BFED //A16
205 data8 0x8E34A678F3497D17, 0x0000BFEC //A15
206 data8 0xAC185D45A2772384, 0x00003FEF //A14
207 data8 0xB0C11347CE7EEDE8, 0x00003FEF //A13
208 data8 0xD3330DC14EA0E4EB, 0x0000BFF2 //A12
209 data8 0xB4A6DFDE578A428F, 0x00003FF1 //A11
210 data8 0xA0B4034310D2D9CB, 0x00003FF5 //A10
211 data8 0xF71662D3132B7759, 0x0000BFF5 //A9
212 data8 0x9C88BF157695E9EC, 0x0000BFF7 //A8
213 data8 0xF84B80EFCA43895D, 0x00003FF8 //A7
214 data8 0x9722D22DA628A17B, 0x00003FF7 //A6
215 data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5
216 data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4
217 // Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25 
218 data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19
219 data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18
220 data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17
221 data8 0xB99A33294D32429D, 0x0000BFFB //A16
222 data8 0x8109D9C7197BC7C9, 0x00003FFB //A15
223 data8 0xC30DE8E2EFC2D760, 0x00003FFA //A14
224 data8 0x80DDA28C5B35DC73, 0x0000BFFC //A13
225 data8 0x9BE4DE5095BACE0D, 0x00003FF9 //A12
226 data8 0xDA4092509EE7D111, 0x00003FFC //A11
227 data8 0x89D98C561B0C9040, 0x0000BFFD //A10
228 data8 0xD20B26EB2F0881D4, 0x0000BFF9 //A9
229 data8 0xD089C56948731561, 0x00003FFD //A8
230 data8 0xDD704DEFFB21B7E7, 0x0000BFFD //A7
231 data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6
232 data8 0xD673A02CB5766633, 0x00003FFD //A5
233 data8 0x8D162CBAD8A12649, 0x0000BFFE //A4
234 // Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0  
235 data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19
236 data8 0xF76BE1935675D5C8, 0x00003FFE //A18
237 data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17
238 data8 0x8BE8F573D348DDA4, 0x00004000 //A16
239 data8 0x81E91923A1030502, 0x0000BFFF //A15
240 data8 0xCE7FE87B26CFD286, 0x0000BFFE //A14
241 data8 0x84EF6B4E17404384, 0x00004000 //A13
242 data8 0x91FEF33015404991, 0x0000C000 //A12
243 data8 0xDEDF6A9370747E56, 0x00003FFF //A11
244 data8 0x8397E6FF56CDFD9D, 0x0000BFFF //A10
245 data8 0xFAD1CE912473937B, 0x00003FFD //A9
246 data8 0xC48C1EA8AAA624EA, 0x0000BFFC //A8
247 data8 0xFECAF0097ACF981B, 0x00003FFA //A7
248 data8 0x8829A394065E4B95, 0x0000BFF9 //A6
249 data8 0xED3003E477A53EE7, 0x00003FF6 //A5
250 data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4
252 // Coefficients ##16..19
// (offset 1024; the code reaches a subrange's set at 1024 + 64*subrange)
253 // Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0 
254 data8 0x95FA98C337005D13, 0x0000BFF9 //A3
255 data8 0xE0F7E524D2808A97, 0x0000BFFB //A2
256 data8 0xE0F7E524D2808A98, 0x00003FFD //A1
257 data8 0x853F7AE0C76E915F, 0x00003FFE //A0
258 // Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0 
259 data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3
260 data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2
261 data8 0xD488F84B7DE12E9C, 0x00003FFD //A1
262 data8 0xD7BB3D3A08445636, 0x00003FFE //A0
263 // Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
264 data8 0xC58571D23D5C4B3A, 0x00003FFD //A3
265 data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2
266 data8 0xA94DCF467CD10A16, 0x00003FFA //A1
267 data8 0xFECD70A13CAF1997, 0x00003FFE //A0 
268 // Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0 
269 data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3
270 data8 0x8858A465CE594BD1, 0x0000BFEE //A2
271 data8 0x8858A447456DE61D, 0x00003FEA //A1
272 data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0
273 // Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5 
// (offset 1280; stored in load order, not in coefficient order:
//  one load stream reads A9,A8 / A5,A4 / A1, the other A7,A6 / A3,A2 / A0)
274 data8 0xBE839EDBB36C7FCE //A9
275 data8 0x3EBB7745A18DD242 //A8
276 data8 0xBF4C02DB238F2AFC //A5
277 data8 0x3F7565BCD0A9A3EA //A4
278 data8 0xC093A3581BCF3333, 0x0000BFFD //A1
279 data8 0xBEEF4BB82AD8AE22 //A7
280 data8 0x3F1F9A2A57A218CD //A6
281 data8 0xBF9B82CE3127F4E4 //A3
282 data8 0x3FBCE2F21A042B25 //A2
283 data8 0x906EBA8214DB688D, 0x00003FFF //A0
284 // 1.0 - 2^(-63)
// (offset 1376; loaded by erf_saturation)
285 data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE 
286 // Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0 
// (offset 1392; 15 entries, A0 sits at offset 1616)
287 data8 0x95E91576C7A12250, 0x00003FE7 //A14
288 data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13
289 data8 0xED761DAFAF814DE9, 0x00003FEB //A12
290 data8 0xB3A77D921D0ACFC7, 0x0000BFEC //A11
291 data8 0xA662D27096B08D7C, 0x0000BFEC //A10
292 data8 0xDA0F410AE6233EA5, 0x00003FEF //A9
293 data8 0xAB4A8B16B3124327, 0x0000BFF1 //A8
294 data8 0xB241E236A5EDCED3, 0x00003FF2 //A7
295 data8 0x8A2A65BA1F551F77, 0x0000BFF3 //A6
296 data8 0xA4852D0B1D87000A, 0x00003FF3 //A5
297 data8 0x963EB00039489476, 0x0000BFF3 //A4
298 data8 0xCD5244FF4F7313A5, 0x00003FF2 //A3
299 data8 0xC6F1E695363BCB26, 0x0000BFF1 //A2
300 data8 0xF4DAF4680DA54C02, 0x00003FEF //A1
301 data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0
302 // A = 2.0/sqrt(Pi)
// (offset 1632; loaded by erf_denormal)
303 data8 0x906EBA8214DB688D, 0x00003FFF 
304 LOCAL_OBJECT_END(erf_data)
// double erf(double x): x arrives in f8 and the result is returned in f8.
// The entry sequence classifies x (denormal, NaN/zero, infinity), derives a
// coefficient-table offset from the exponent field of x and dispatches:
//   erf_denormal    x is denormal
//   erf_near_zero   |x| < 0.5
//   erf_3q_4        3.25 < |x| < 4.0
//   erf_saturation  (|x| bits >> 44) > 0x4017A
//   fall through    path #3, degree-19 polynomial in y = significand(x) - 1.0
307 .section .text
308 GLOBAL_LIBM_ENTRY(erf)
310 { .mfi
311       alloc          r32 = ar.pfs, 0, 17, 0, 0
312       fmerge.se      fArgAbsNorm = f1, f8         // significand of x with sign/exponent of 1.0
313       adds           rSignBit = 0x1, r0
315 { .mfi
316       addl           rDataPtr = @ltoff(erf_data), gp
317       fma.s1         fArgSqr = f8, f8, f0         // x^2
318       addl           rThreeAndQ = 0x400A0, r0     // shifted bits of 3.25
321 { .mfi
322       getf.d         rArg = f8                    // x in GR 
323       fclass.m       p6,p0 = f8, 0x0b             // is x denormal ?
324       shl            rThreeAndQ = rThreeAndQ, 44  // bits of 3.25
326 { .mfi
327       ld8            rDataPtr = [rDataPtr]        // rDataPtr = address of erf_data
328       nop.f          0
329       addl           rBiasedExpOf4 = 0x40100, r0  // shifted bits of 4.0
332 { .mfi
333       addl           rSaturation = 0x4017A, r0    // shifted bits of 5.90625
334       fclass.m       p7,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
335       shl            rSignBit = rSignBit, 63      // mask for sign bit
337 { .mfi
338       addl           rMask = 0x7FF00, r0          // Mask for index bits
339       nop.f          0
340       addl           rBias = 0x3FE00, r0          // bias of 0.5 << 8
343 { .mfi
344       setf.d         fThreeAndQ = rThreeAndQ      // 3.25 in FP register
345       fclass.m       p9,p0 = f8, 0x23             // is x +/- inf?
346       shr.u          rShiftedArg = rArg, 44       // sign, exponent, top 8 mantissa bits
348 { .mfb
349       andcm          rAbsArg = rArg, rSignBit     // |x| in GR
350       nop.f          0
351 (p6)  br.cond.spnt   erf_denormal                 // branch out if x is denormal
353 ;;   
354 { .mfi
355       and            rShiftedArgMasked = rShiftedArg, rMask // biased exponent of x << 8
356       fmerge.s       fArgAbs = f1, f8             // |x|
357       shr            rShiftedAbsArg = rAbsArg, 44
359 { .mfb
// The compare below is strict: |x| == 3.25 leaves p8 = 0 and p11 = 1.
360       cmp.lt         p8, p11 = rThreeAndQ, rAbsArg // p8 = 1 if |x| > 3.25
361 (p7)  fma.d.s0       f8 = f8,f1,f8                // NaN or +/-0: f8 = x*1.0 + x
362 (p7)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
364 ;;              
365 { .mfi
// rIndex = (biased exponent of x - biased exponent of 0.5) * 256, i.e. the
// byte offset of the 256-byte coefficient set for the binade that holds |x|.
366       sub            rIndex = rShiftedArgMasked, rBias // index << 8
367       nop.f          0 
368       cmp.lt         p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5 
370 { .mfb
371       // p8 = 1 if 3.25 <= |x| < 4.0 
372 (p8)  cmp.lt         p8, p11 = rShiftedAbsArg, rBiasedExpOf4 
373       fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1 // y = significand - 1.0
374 (p10) br.cond.spnt   erf_near_zero // branch out if |x| < 0.5
377 .pred.rel "mutex", p8, p11
378 { .mfi
379 (p8)  adds           rCoeffAddr1 = 1392, rDataPtr // coeff. for 3.25 <=|x|<4.0
380 (p9)  fmerge.s       f8 = f8,f1                   // +/- inf: sign of x, magnitude 1.0
381       nop.i          0
383 { .mfb
384 (p11) add            rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
385       nop.f          0
386 (p9)  br.ret.spnt    b0                            // exit for x = +/- inf
389 { .mfi
390       adds           rCoeffAddr2 = 16, rCoeffAddr1 // coeff. ##1,3,..15
391       fmerge.s       fSignumX = f8, f1            // signum(x)
392       nop.i          0
394 { .mfb
// Compares the top 20 bits of |x| (sign already cleared) against 0x4017A.
395       cmp.lt         p12, p0 = rSaturation, rShiftedAbsArg // (|x| bits >> 44) > 0x4017A ?
396       nop.f          0
397 (p12) br.cond.spnt   erf_saturation               // branch out if |x| is in the saturation range
400 // Here if paths #3,4
401 // if path #4 we'll branch out after loading of 14 necessary coefficients
// Two load streams, each stepping by 32 bytes, alternate through the
// 16-byte table entries: rCoeffAddr1 reads the even-numbered entries and
// rCoeffAddr2 the odd-numbered ones.
402 {.mfi
403       ldfe           fA19 = [rCoeffAddr1], 32
404       nop.f          0
405       nop.i          0
407 {.mfi
408       ldfe           fA18 = [rCoeffAddr2], 32
409       nop.f          0
410       adds           rCoeffAddr3 = 1024, rDataPtr // start of coefficients ##16..19
413 {.mfi
414       ldfe           fA17 = [rCoeffAddr1], 32
415       nop.f          0
416       nop.i          0
418 {.mfi
419       ldfe           fA16 = [rCoeffAddr2], 32
420       nop.f          0
421       nop.i          0
424 {.mfi
425       ldfe           fA15 = [rCoeffAddr1], 32
426       fma.s1         fTSqr = fArgAbsNorm, fArgAbsNorm, f0 // y^2
427       shr.u          rIndex = rIndex, 2           // 64-byte stride for the ##16..19 sets
429 {.mfi
430       ldfe           fA14 = [rCoeffAddr2], 32
431       nop.f          0
432       adds           rCoeffAddr4 = 16, r0
435 {.mfi
436       ldfe           fA13 = [rCoeffAddr1], 32
437       nop.f          0
438       // address of coefficients ##16..19 for this subrange
439       add            rCoeffAddr3 = rCoeffAddr3, rIndex 
441 {.mfi
442       ldfe           fA12 = [rCoeffAddr2], 32
443       nop.f          0
444       cmp.lt         p15, p14 = rArg, r0          // p15 = 1 if x < 0, else p14 = 1
447 {.mfi
448       ldfe           fA11 = [rCoeffAddr1], 32
449       nop.f          0
450       add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4 // rCoeffAddr3 + 16
452 {.mfi
453       ldfe           fA10 = [rCoeffAddr2], 32
454       nop.f          0
455       nop.i          0
458 {.mfi
459       ldfe           fA9 = [rCoeffAddr1], 32
460       nop.f          0
461       nop.i          0
463 {.mfi
464       ldfe           fA8 = [rCoeffAddr2], 32
465       nop.f          0
466       nop.i          0
469 {.mfi
470       ldfe           fA7 = [rCoeffAddr1], 32
471       fms.s1         fArgAbs = fArgAbs, f1, fThreeAndQ // |x| - 3.25, argument for erf_3q_4
472       nop.i          0
474 {.mfb
475       ldfe           fA6 = [rCoeffAddr2], 32
476       nop.f          0
477 (p8)  br.cond.spnt   erf_3q_4 // branch out if  3.25 < |x| < 4.0
478 }                                    
480 {.mfi
481       ldfe           fA5 = [rCoeffAddr1], 32
482       fma.s1         fTDeg3 = fArgAbsNorm, fTSqr, f0 // y^3
483       nop.i          0
485 {.mfi
486       ldfe           fA4 = [rCoeffAddr2], 32
487       fma.s1         fTQuadr = fTSqr, fTSqr, f0   // y^4
488       nop.i          0
491 // Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
// The fma chain below builds A1 + A2*y + ... + A19*y^18 in fRes out of
// short partial sums; the final fma/fms multiplies by sign(x)*y and
// adds (x >= 0) or subtracts (x < 0) A0.
492 {.mfi
493       ldfe           fA3 = [rCoeffAddr3], 32
494       fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0 // sign(x)*y
495       nop.i          0
497 {.mfi
498       ldfe           fA2 = [rCoeffAddr4], 32
499       nop.f          0
500       nop.i          0
503 {.mfi
504       ldfe           fA1 = [rCoeffAddr3], 32
505       fma.s1         fRes = fA19, fArgAbsNorm, fA18
506       nop.i          0
508 {.mfi
509       ldfe           fA0 = [rCoeffAddr4], 32
510       nop.f          0
511       nop.i          0
514 { .mfi
515       nop.m          0
516       fma.s1         fA17 = fA17, fArgAbsNorm, fA16
517       nop.i          0
520 { .mfi
521       nop.m          0
522       fma.s1         fA15 = fA15, fArgAbsNorm, fA14
523       nop.i          0
526 { .mfi
527       nop.m          0
528       fma.s1         fTDeg7 = fTDeg3, fTQuadr, f0 // y^7
529       nop.i          0
531 { .mfi
532       nop.m          0
533       fma.s1         fA13 = fA13, fArgAbsNorm, fA12
534       nop.i          0
537 { .mfi
538       nop.m          0
539       fma.s1         fA11 = fA11, fArgAbsNorm, fA10
540       nop.i          0
543 { .mfi
544       nop.m          0
545       fma.s1         fA9 = fA9, fArgAbsNorm, fA8
546       nop.i          0
549 { .mfi
550       nop.m          0
551       fma.s1         fRes = fRes, fTSqr, fA17
552       nop.i          0
554 { .mfi
555       nop.m          0
556       fma.s1         fA7 = fA7, fArgAbsNorm, fA6
557       nop.i          0
560 { .mfi
561       nop.m          0
562       fma.s1         fA5 = fA5, fArgAbsNorm, f0   // A5*y (no constant term added here)
563       nop.i          0
566 { .mfi
567       nop.m          0
568       fma.s1         fA15 = fA15, fTSqr, fA13  
569       nop.i          0
571 { .mfi
572       nop.m          0
573       fma.s1         fA4 = fA4, fArgAbsNorm, fA3
574       nop.i          0
577 { .mfi
578       nop.m          0
579       fma.s1         fA2 = fA2, fArgAbsNorm, fA1
580       nop.i          0
583 { .mfi
584       nop.m          0
585       fma.s1         fA11 = fA11, fTSqr, fA9
586       nop.i          0
589 { .mfi
590       nop.m          0                                       
591       fma.s1         fA7 = fA7, fTSqr, fA5
592       nop.i          0
595 { .mfi
596       nop.m          0                                       
597       fma.s1         fRes = fRes, fTQuadr, fA15
598       nop.i          0
601 { .mfi
602       nop.m          0                                       
603       fma.s1         fA4 = fA4, fTSqr, fA2
604       nop.i          0
607 { .mfi
608       nop.m          0
609       fma.s1         fRes = fRes, fTQuadr, fA11
610       nop.i          0
613 { .mfi
614       nop.m          0                                       
615       fma.s1         fA4 = fA7, fTDeg3, fA4
616       nop.i          0
619 { .mfi
620       nop.m          0
621       fma.s1         fRes = fRes,  fTDeg7, fA4
622       nop.i          0
625 { .mfi
626       nop.m          0
627       // result for negative argument
628 (p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
629       nop.i          0
631 { .mfb
632       nop.m          0
633       // result for positive argument
634 (p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
635       br.ret.sptk    b0
638 // Here if  3.25 < |x| < 4.0
// On entry:
//   fArgAbs  = t = |x| - 3.25
//   fSignumX = sign(x) attached to 1.0
//   p14/p15  = x >= 0 / x < 0
//   fA19..fA6 hold table coefficients A14..A1 of the 3.25..4.0 set, loaded
//   before the branch to this label; A0 is loaded here into fA5.
// Builds A1 + A2*t + ... + A14*t^13 in fRes, then returns
// fRes*(sign(x)*t) + A0 for x >= 0 or fRes*(sign(x)*t) - A0 for x < 0.
639 .align 32
640 erf_3q_4:                                   
641 .pred.rel "mutex", p14, p15
642 { .mfi
643       ldfe           fA5 = [rCoeffAddr1], 32      // table A0
644       fma.s1         fTSqr = fArgAbs, fArgAbs, f0 // t^2
645       nop.i          0
647 { .mfi
648       nop.m          0
649       fma.s1         fRes = fA19, fArgAbs, fA18
650       nop.i          0
653 { .mfi
654       nop.m          0
655       fma.s1         fA17 = fA17, fArgAbs, fA16
656       nop.i          0
658 { .mfi
659       nop.m          0
660       fma.s1         fA15 = fA15, fArgAbs, fA14
661       nop.i          0
663 ;;      
664 { .mfi
665       nop.m          0
666       fma.s1         fA13 = fA13, fArgAbs, fA12
667       nop.i          0
669 { .mfi
670       nop.m          0
671       fma.s1         fA11 = fA11, fArgAbs, fA10
672       nop.i          0
674 ;;     
675 { .mfi
676       nop.m          0
677       fma.s1         fA9 = fA9, fArgAbs, fA8
678       nop.i          0
680 { .mfi
681       nop.m          0
682       fma.s1         fArgAbsNormSgn = fArgAbs, fSignumX, f0 // sign(x)*t
683       nop.i          0
685 ;;     
686 { .mfi
687       nop.m          0
688       fma.s1         fTQuadr = fTSqr, fTSqr, f0   // t^4
689       nop.i          0
692 { .mfi
693       nop.m          0
694       fma.s1         fRes = fRes, fTSqr, fA17
695       nop.i          0
697 ;;  
698 { .mfi
699       nop.m          0
700       fma.s1         fA15 = fA15, fTSqr, fA13
701       nop.i          0
703 ;;  
704 { .mfi
705       nop.m          0
706       fma.s1         fA11 = fA11, fTSqr, fA9
707       nop.i          0
708 }  
709 { .mfi
710       nop.m          0
711       fma.s1         fA7 = fA7, fArgAbs, fA6
712       nop.i          0
714 ;; 
715 { .mfi
716       nop.m          0
// Despite the register name, the value formed here is t^4 * t^2 = t^6.
717       fma.s1         fTDeg7 = fTQuadr, fTSqr, f0
718       nop.i          0
720 { .mfi
721       nop.m          0
722       fma.s1         fRes = fRes, fTQuadr, fA15
723       nop.i          0
725 ;; 
726 { .mfi
727       nop.m          0
728       fma.s1         fA11 = fA11, fTSqr, fA7 
729       nop.i          0
732 { .mfi
733       nop.m          0
734       fma.s1         fRes = fRes, fTDeg7, fA11
735       nop.i          0
737 ;; 
738 { .mfi
739       nop.m          0
740       // result for negative argument
741 (p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA5
742       nop.i          0
744 { .mfb
745       nop.m          0
746       // result for positive argument
747 (p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA5
748       br.ret.sptk    b0
752 // Here if |x| < 0.5
// On entry: f8 = x, fArgSqr = x^2, rDataPtr = address of erf_data.
// Evaluates erf(x) = x*Pol9(x^2) as two halves:
//   fRes = A4 + A5*x^2 + ... + A9*x^10   (later multiplied by x^9)
//   fA1  = x*(A0 + A1*x^2 + A2*x^4 + A3*x^6)
// A2..A9 are 8-byte table entries loaded in pairs with ldfpd; A0 and A1
// are 16-byte entries loaded with ldfe.
753 .align 32
754 erf_near_zero:
755 { .mfi
756       adds           rCoeffAddr1 = 1280, rDataPtr // address of A9
757       fma.s1         fTSqr = fArgSqr, fArgSqr, f0 // x^4 
758       nop.i          0
760 { .mfi
761       adds           rCoeffAddr2 = 1328, rDataPtr // address of A7
762       nop.f          0
763       nop.i          0
766 { .mfi
767       ldfpd          fA9, fA8 = [rCoeffAddr1], 16
768       nop.f          0
769       nop.i          0
771 { .mfi
772       ldfpd          fA7, fA6 = [rCoeffAddr2], 16
773       nop.f          0
774       nop.i          0
777 { .mfi
778       ldfpd          fA5, fA4 = [rCoeffAddr1], 16
779       nop.f          0
780       nop.i          0
782 { .mfi
783       ldfpd          fA3, fA2 = [rCoeffAddr2], 16
784       nop.f          0
785       nop.i          0
788 { .mfi
789       ldfe           fA1 = [rCoeffAddr1]
790       nop.f          0
791       nop.i          0
793 { .mfi
794       ldfe           fA0 = [rCoeffAddr2]
795       nop.f          0
796       nop.i          0
799 { .mfi
800       nop.m          0
801       fma.s1         fTQuadr = fTSqr, fTSqr, f0   // x^8
802       nop.i          0
805 { .mfi
806       nop.m          0
807       fma.s1         fRes = fA9, fArgSqr, fA8
808       nop.i          0
810 { .mfi
811       nop.m          0
812       fma.s1         fA7 = fA7, fArgSqr, fA6
813       nop.i          0
816 { .mfi
817       nop.m          0
818       fma.s1         fA3 = fA3, fArgSqr, fA2
819       nop.i          0
821 { .mfi
822       nop.m          0
823       fma.s1         fA5 = fA5, fArgSqr, fA4
824       nop.i          0
827 { .mfi
828       nop.m          0
829       fma.s1         fA1 = fA1, fArgSqr, fA0
830       nop.i          0
832 { .mfi
833       nop.m          0
834       fma.s1         fTQuadrSgn = fTQuadr, f8, f0 // x^9
835       nop.i          0
838 { .mfi
839       nop.m          0
840       fma.s1         fRes = fRes, fTSqr, fA7
841       nop.i          0
844 { .mfi
845       nop.m          0
846       fma.s1         fA1 = fA3, fTSqr, fA1        // A0 + A1*x^2 + A2*x^4 + A3*x^6
847       nop.i          0
850 { .mfi
851       nop.m          0
852       fma.s1         fRes = fRes, fTSqr, fA5      // A4 + A5*x^2 + ... + A9*x^10
853       nop.i          0
856 { .mfi
857       nop.m          0
858       fma.s1         fA1 = fA1, f8, f0            // low-order half times x
859       nop.i          0
862 { .mfb
863       nop.m          0
864       fma.d.s0       f8 = fRes, fTQuadrSgn, fA1 // x*Pol9(x^2)
865       br.ret.sptk    b0                              // Exit for |x| < 0.5
868 // Here if 5.90625 <= |x| < +inf
// On entry: rDataPtr = address of erf_data, fSignumX = sign(x) attached to 1.0.
// Returns sign(x) * (1.0 - 2^(-63)) rounded to double; infinities have
// already returned before this label is reached.
869 .align 32
870 erf_saturation:
871 { .mfi
872       adds           rDataPtr = 1376, rDataPtr     // address of the 1.0 - 2^(-63) constant
873       nop.f          0
874       nop.i          0
877 { .mfi
878       ldfe           fA0 = [rDataPtr]              // fA0 = 1.0 - 2^(-63)
879       nop.f          0
880       nop.i          0
883 { .mfb
884       nop.m          0
885       fma.d.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0 - 2^(-63))
886       // Exit for 5.90625 <= |x| < +inf
887       br.ret.sptk    b0                          // Exit for 5.90625 <=|x|< +inf
890       
891 // Here if x is double precision denormal
// On entry: f8 = x, rDataPtr = address of erf_data.
// Returns A0*x + x^2 for a negative denormal and A0*x - x^2 for a positive
// one, with A0 = 2.0/sqrt(Pi).
892 .align 32
893 erf_denormal:
894 { .mfi
895       adds           rDataPtr = 1632, rDataPtr    // address of A0
896       fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?  (p8 = 1 otherwise)
897       nop.i          0
900 { .mfi
901       ldfe           fA0 = [rDataPtr]             // A0
902       nop.f          0
903       nop.i          0
906 { .mfi
907       nop.m          0
908       fma.s1         fA0 = fA0,f8,f0              // A0*x
909       nop.i          0
912 { .mfi
913       nop.m          0
914 (p7)  fma.d.s0       f8 = f8,f8,fA0               // -denormal: x*x + A0*x
915       nop.i          0
917 { .mfb
918       nop.m          0
919 (p8)  fnma.d.s0      f8 = f8,f8,fA0               // +denormal: -(x*x) + A0*x
920       br.ret.sptk    b0                           // Exit for denormal
924 GLOBAL_LIBM_END(erf)