Update copyright dates with scripts/update-copyrights
[glibc.git] / sysdeps / ia64 / fpu / s_erf.S
blobfefdefdf2dbcd3b1341e59d2082656bdf562399a
1 .file "erf.s"
4 // Copyright (c) 2001 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
39 // History
40 //==============================================================
41 // 08/15/01 Initial version
42 // 05/20/02 Cleaned up namespace and sf0 syntax
43 // 02/06/03 Reordered header: .section, .global, .proc, .align
44 // 03/31/05 Reformatted delimiters between data tables
46 // API
47 //==============================================================
48 // double erf(double)
50 // Overview of operation
51 //==============================================================
52 // Background
55 // There are 9 paths:
56 // 1. x = +/-0.0
57 //    Return erf(x) = +/-0.0
59 // 2. 0.0 < |x| < 0.5
60 //    Return erf(x) = x *Pol9(x^2)
62 // 3. For several subranges of 0.5 <= |x| < 5.90625
63 //    Return erf(x) = sign(x)*Pol19(y),
64 //    where y = (|x|-b)/a, Pol19(y) = A0 + A1*y^1 + A2*y^2 + ... + A19*y^19
66 //    For each subrange there is particular set of coefficients.
67 //    Below is the list of subranges:
68 //    3.1 0.5 <= |x| < 1.0     b = a = 0.5
69 //    3.2 1.0 <= |x| < 2.0,    b = a = 1.0
70 //    3.3 2.0 <= |x| < 3.25    b = a = 2.0
71 //    3.4 4.0 <= |x| < 5.90625 b = 4.0, a = 2.0
73 // 4. 3.25 <= |x| < 4.0
74 //    Return erf(x) = sign(x)*Pol14(|x| - 3.25)
76 // 5. 5.90625 <= |x| < +INF
77 //    Return erf(x) = sign(x)*(1.0d - 2^(-63))
79 // 6. |x| = INF
80 //    Return erf(x) = sign(x) * 1.0
82 // 7. x = [S,Q]NaN
83 //    Return erf(x) = QNaN
85 // 8. x is positive denormal
86 //    Return erf(x) = A0*x - x^2,
87 //    where A0 = 2.0/sqrt(Pi)
89 // 9. x is negative denormal
90 //    Return erf(x) = A0*x + x^2,
91 //    where A0 = 2.0/sqrt(Pi)
93 // Registers used
94 //==============================================================
95 // Floating Point registers used:
96 // f8, input, output
97 // f32 -> f63
99 // General registers used:
100 // r32 -> r48, r2, r3
102 // Predicate registers used:
103 // p0, p6 -> p15
105 // p6           to filter out case when x = denormal
106 // p7           to filter out case when x = [Q,S]NaN or +/-0,
107 //              used also to process denormals
108 // p8           to filter out case when 3.25 <= |x| < 4.0,
109 //              used also to process denormals
110 // p9           to filter out case when |x| = inf
111 // p10          to filter out case when |x| < 0.5
112 // p11          set when |x| < 3.25 or |x| > 4.0
113 // p12          to filter out case when |x| >= 5.90625
114 // p13          set if 4.0 <=|x| < 5.90625
115 // p14          set to 1 for positive x
116 // p15          set to 1 for negative x
118 // Assembly macros
119 //==============================================================
120 rDataPtr           = r2
121 rDataPtr1          = r3
123 rBias              = r33
124 rCoeffAddr3        = r34
125 rThreeAndQ         = r35
126 rCoeffAddr2        = r36
127 rMask              = r37
128 rArg               = r38
129 rSignBit           = r39
130 rAbsArg            = r40
131 rSaturation        = r41
132 rIndex             = r42
133 rCoeffAddr1        = r43
134 rCoeffAddr4        = r44
135 rShiftedArg        = r45
136 rShiftedArgMasked  = r46
137 rBiasedExpOf4      = r47
138 rShiftedAbsArg     = r48
140 //==============================================================
141 fA0                = f32
142 fA1                = f33
143 fA2                = f34
144 fA3                = f35
145 fA4                = f36
146 fA5                = f37
147 fA6                = f38
148 fA7                = f39
149 fA8                = f40
150 fA9                = f41
151 fA10               = f42
152 fA11               = f43
153 fA12               = f44
154 fA13               = f45
155 fA14               = f46
156 fA15               = f47
157 fA16               = f48
158 fA17               = f49
159 fA18               = f50
160 fA19               = f51
161 fArgSqr            = f52
162 fArgAbsNorm        = f53
163 fSignumX           = f54
164 fRes               = f55
165 fThreeAndQ         = f56
166 fArgAbs            = f57
167 fTSqr              = f58
168 fTQuadr            = f59
169 fTDeg3             = f60
170 fTDeg7             = f61
171 fArgAbsNormSgn     = f62
172 fTQuadrSgn         = f63
174 // Data tables
175 //==============================================================
176 RODATA
178 .align 64
180 LOCAL_OBJECT_START(erf_data)
181 // Coefficients ##0..15
182 // Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
183 data8 0xB69AC40646D1F6C1, 0x00003FD2 //A19
184 data8 0x90AD48C0118FA10C, 0x00003FD7 //A18
185 data8 0x826FBAD055EA4AB8, 0x0000BFDB //A17
186 data8 0x8DAB171246CC2B89, 0x00003FDC //A16
187 data8 0xC0B1D6662F8A7564, 0x00003FDF //A15
188 data8 0xA46374AC35099BAF, 0x0000BFE1 //A14
189 data8 0xB2F230996346EF27, 0x0000BFE4 //A13
190 data8 0xCDEC50950FACE04A, 0x00003FE6 //A12
191 data8 0x826014649396E9D2, 0x00003FE9 //A11
192 data8 0xCDB787DC718B13F9, 0x0000BFEB //A10
193 data8 0x8E0B23C24EE0C8EE, 0x0000BFED //A9
194 data8 0xA49EA40A4E5A3F76, 0x00003FF0 //A8
195 data8 0xB11E30BE912617D3, 0x00003FF0 //A7
196 data8 0xCCF89D9351CE26E3, 0x0000BFF4 //A6
197 data8 0xEFF75AD1F0F22809, 0x00003FF2 //A5
198 data8 0xBB793EF404C09A22, 0x00003FF8 //A4
199 // Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
200 data8 0xBAE93FF4174EA59B, 0x00003FE6 //A19
201 data8 0x8A0FD46092F95D44, 0x0000BFEA //A18
202 data8 0xA37B3242B7809E12, 0x00003FEC //A17
203 data8 0xA0330A5CD2E91689, 0x0000BFED //A16
204 data8 0x8E34A678F3497D17, 0x0000BFEC //A15
205 data8 0xAC185D45A2772384, 0x00003FEF //A14
206 data8 0xB0C11347CE7EEDE8, 0x00003FEF //A13
207 data8 0xD3330DC14EA0E4EB, 0x0000BFF2 //A12
208 data8 0xB4A6DFDE578A428F, 0x00003FF1 //A11
209 data8 0xA0B4034310D2D9CB, 0x00003FF5 //A10
210 data8 0xF71662D3132B7759, 0x0000BFF5 //A9
211 data8 0x9C88BF157695E9EC, 0x0000BFF7 //A8
212 data8 0xF84B80EFCA43895D, 0x00003FF8 //A7
213 data8 0x9722D22DA628A17B, 0x00003FF7 //A6
214 data8 0x8DB0A586F8F3381F, 0x0000BFFB //A5
215 data8 0x8DB0A5879F87E5BE, 0x00003FFB //A4
216 // Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
217 data8 0x9C4AF1F3A4B21AFC, 0x00003FF6 //A19
218 data8 0x8D40D5D5DB741AB8, 0x0000BFF9 //A18
219 data8 0xDEBE7099E0A75BA4, 0x00003FFA //A17
220 data8 0xB99A33294D32429D, 0x0000BFFB //A16
221 data8 0x8109D9C7197BC7C9, 0x00003FFB //A15
222 data8 0xC30DE8E2EFC2D760, 0x00003FFA //A14
223 data8 0x80DDA28C5B35DC73, 0x0000BFFC //A13
224 data8 0x9BE4DE5095BACE0D, 0x00003FF9 //A12
225 data8 0xDA4092509EE7D111, 0x00003FFC //A11
226 data8 0x89D98C561B0C9040, 0x0000BFFD //A10
227 data8 0xD20B26EB2F0881D4, 0x0000BFF9 //A9
228 data8 0xD089C56948731561, 0x00003FFD //A8
229 data8 0xDD704DEFFB21B7E7, 0x0000BFFD //A7
230 data8 0xF0C9A6BBDE469115, 0x00003FF9 //A6
231 data8 0xD673A02CB5766633, 0x00003FFD //A5
232 data8 0x8D162CBAD8A12649, 0x0000BFFE //A4
233 // Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
234 data8 0xD4428B75C6FE8FD1, 0x0000BFFC //A19
235 data8 0xF76BE1935675D5C8, 0x00003FFE //A18
236 data8 0xFD6BB3B14AA7A8E6, 0x0000BFFF //A17
237 data8 0x8BE8F573D348DDA4, 0x00004000 //A16
238 data8 0x81E91923A1030502, 0x0000BFFF //A15
239 data8 0xCE7FE87B26CFD286, 0x0000BFFE //A14
240 data8 0x84EF6B4E17404384, 0x00004000 //A13
241 data8 0x91FEF33015404991, 0x0000C000 //A12
242 data8 0xDEDF6A9370747E56, 0x00003FFF //A11
243 data8 0x8397E6FF56CDFD9D, 0x0000BFFF //A10
244 data8 0xFAD1CE912473937B, 0x00003FFD //A9
245 data8 0xC48C1EA8AAA624EA, 0x0000BFFC //A8
246 data8 0xFECAF0097ACF981B, 0x00003FFA //A7
247 data8 0x8829A394065E4B95, 0x0000BFF9 //A6
248 data8 0xED3003E477A53EE7, 0x00003FF6 //A5
249 data8 0xA4C07E9BB3FCB0F3, 0x0000BFF4 //A4
251 // Coefficients ##16..19
252 // Polynomial coefficients for the erf(x), 0.5 <= |x| < 1.0
253 data8 0x95FA98C337005D13, 0x0000BFF9 //A3
254 data8 0xE0F7E524D2808A97, 0x0000BFFB //A2
255 data8 0xE0F7E524D2808A98, 0x00003FFD //A1
256 data8 0x853F7AE0C76E915F, 0x00003FFE //A0
257 // Polynomial coefficients for the erf(x), 1.0 <= |x| < 2.0
258 data8 0x8DB0A587A96ABCF0, 0x00003FFC //A3
259 data8 0xD488F84B7DE18DA8, 0x0000BFFD //A2
260 data8 0xD488F84B7DE12E9C, 0x00003FFD //A1
261 data8 0xD7BB3D3A08445636, 0x00003FFE //A0
262 // Polynomial coefficients for the erf(x), 2.0 <= |x| < 3.25
263 data8 0xC58571D23D5C4B3A, 0x00003FFD //A3
264 data8 0xA94DCF467CD6AFF3, 0x0000BFFC //A2
265 data8 0xA94DCF467CD10A16, 0x00003FFA //A1
266 data8 0xFECD70A13CAF1997, 0x00003FFE //A0
267 // Polynomial coefficients for the erf(x), 4.0 <= |x| < 6.0
268 data8 0xB01D2B4F0D5AB8B0, 0x00003FF1 //A3
269 data8 0x8858A465CE594BD1, 0x0000BFEE //A2
270 data8 0x8858A447456DE61D, 0x00003FEA //A1
271 data8 0xFFFFFFBDC88BB107, 0x00003FFE //A0
272 // Polynomial coefficients for the erf(x), 0.0 <= |x| < 0.5
273 data8 0xBE839EDBB36C7FCE //A9
274 data8 0x3EBB7745A18DD242 //A8
275 data8 0xBF4C02DB238F2AFC //A5
276 data8 0x3F7565BCD0A9A3EA //A4
277 data8 0xC093A3581BCF3333, 0x0000BFFD //A1
278 data8 0xBEEF4BB82AD8AE22 //A7
279 data8 0x3F1F9A2A57A218CD //A6
280 data8 0xBF9B82CE3127F4E4 //A3
281 data8 0x3FBCE2F21A042B25 //A2
282 data8 0x906EBA8214DB688D, 0x00003FFF //A0
283 // 1.0 - 2^(-63)
284 data8 0xFFFFFFFFFFFFFFFF, 0x00003FFE
285 // Polynomial coefficients for the erf(x), 3.25 <= |x| < 4.0
286 data8 0x95E91576C7A12250, 0x00003FE7 //A14
287 data8 0x8E5E0D0E1F5D3CB5, 0x0000BFEA //A13
288 data8 0xED761DAFAF814DE9, 0x00003FEB //A12
289 data8 0xB3A77D921D0ACFC7, 0x0000BFEC //A11
290 data8 0xA662D27096B08D7C, 0x0000BFEC //A10
291 data8 0xDA0F410AE6233EA5, 0x00003FEF //A9
292 data8 0xAB4A8B16B3124327, 0x0000BFF1 //A8
293 data8 0xB241E236A5EDCED3, 0x00003FF2 //A7
294 data8 0x8A2A65BA1F551F77, 0x0000BFF3 //A6
295 data8 0xA4852D0B1D87000A, 0x00003FF3 //A5
296 data8 0x963EB00039489476, 0x0000BFF3 //A4
297 data8 0xCD5244FF4F7313A5, 0x00003FF2 //A3
298 data8 0xC6F1E695363BCB26, 0x0000BFF1 //A2
299 data8 0xF4DAF4680DA54C02, 0x00003FEF //A1
300 data8 0xFFFFB7CFB3F2ABBE, 0x00003FFE //A0
301 // A = 2.0/sqrt(Pi)
302 data8 0x906EBA8214DB688D, 0x00003FFF
303 LOCAL_OBJECT_END(erf_data)
306 .section .text
307 GLOBAL_LIBM_ENTRY(erf)
309 { .mfi
310       alloc          r32 = ar.pfs, 0, 17, 0, 0
311       fmerge.se      fArgAbsNorm = f1, f8         // normalized x
312       adds           rSignBit = 0x1, r0
314 { .mfi
315       addl           rDataPtr = @ltoff(erf_data), gp
316       fma.s1         fArgSqr = f8, f8, f0         // x^2
317       addl           rThreeAndQ = 0x400A0, r0     // shifted bits of 3.25
320 { .mfi
321       getf.d         rArg = f8                    // x in GR
322       fclass.m       p6,p0 = f8, 0x0b             // is x denormal ?
323       shl            rThreeAndQ = rThreeAndQ, 44  // bits of 3.25
325 { .mfi
326       ld8            rDataPtr = [rDataPtr]
327       nop.f          0
328       addl           rBiasedExpOf4 = 0x40100, r0  // shifted bits of 4.0
331 { .mfi
332       addl           rSaturation = 0x4017A, r0    // shifted bits of 5.90625
333       fclass.m       p7,p0 = f8, 0xc7             // is x [S,Q]NaN or +/-0 ?
334       shl            rSignBit = rSignBit, 63      // mask for sign bit
336 { .mfi
337       addl           rMask = 0x7FF00, r0          // Mask for index bits
338       nop.f          0
339       addl           rBias = 0x3FE00, r0          // bias of 0.5 << 8
342 { .mfi
343       setf.d         fThreeAndQ = rThreeAndQ      // 3.25 if FP register
344       fclass.m       p9,p0 = f8, 0x23             // is x +/- inf?
345       shr.u          rShiftedArg = rArg, 44
347 { .mfb
348       andcm          rAbsArg = rArg, rSignBit     // |x| in GR
349       nop.f          0
350 (p6)  br.cond.spnt   erf_denormal                 // branch out if x is denormal
353 { .mfi
354       and            rShiftedArgMasked = rShiftedArg, rMask // bias of x << 8
355       fmerge.s       fArgAbs = f1, f8             // |x|
356       shr            rShiftedAbsArg = rAbsArg, 44
358 { .mfb
359       cmp.lt         p8, p11 = rThreeAndQ, rAbsArg // p8 = 1 if |x| >= 3.25
360 (p7)  fma.d.s0       f8 = f8,f1,f8                // NaN or +/-0
361 (p7)  br.ret.spnt    b0                           // exit for x = NaN or +/-0
364 { .mfi
365       sub            rIndex = rShiftedArgMasked, rBias // index << 8
366       nop.f          0
367       cmp.lt         p10, p0 = rShiftedArgMasked, rBias // p10 = 1 if |x| < 0.5
369 { .mfb
370       // p8 = 1 if 3.25 <= |x| < 4.0
371 (p8)  cmp.lt         p8, p11 = rShiftedAbsArg, rBiasedExpOf4
372       fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1
373 (p10) br.cond.spnt   erf_near_zero // branch out if |x| < 0.5
376 .pred.rel "mutex", p8, p11
377 { .mfi
378 (p8)  adds           rCoeffAddr1 = 1392, rDataPtr // coeff. for 3.25 <=|x|<4.0
379 (p9)  fmerge.s       f8 = f8,f1                   // +/- inf
380       nop.i          0
382 { .mfb
383 (p11) add            rCoeffAddr1 = rDataPtr, rIndex// coeff. ##0,2,..14
384       nop.f          0
385 (p9)  br.ret.spnt    b0                            // exit for x = +/- inf
388 { .mfi
389       adds           rCoeffAddr2 = 16, rCoeffAddr1
390       fmerge.s       fSignumX = f8, f1            // signum(x)
391       nop.i          0
393 { .mfb
394       cmp.lt         p12, p0 = rSaturation, rShiftedAbsArg // |x| > 5.90625?
395       nop.f          0
396 (p12) br.cond.spnt   erf_saturation               // branch out if x |x| >= 6.0
399 // Here if paths #3,4
400 // if path #4 we'll branch out after loading of 14 necessary coefficients
401 {.mfi
402       ldfe           fA19 = [rCoeffAddr1], 32
403       nop.f          0
404       nop.i          0
406 {.mfi
407       ldfe           fA18 = [rCoeffAddr2], 32
408       nop.f          0
409       adds           rCoeffAddr3 = 1024, rDataPtr
412 {.mfi
413       ldfe           fA17 = [rCoeffAddr1], 32
414       nop.f          0
415       nop.i          0
417 {.mfi
418       ldfe           fA16 = [rCoeffAddr2], 32
419       nop.f          0
420       nop.i          0
423 {.mfi
424       ldfe           fA15 = [rCoeffAddr1], 32
425       fma.s1         fTSqr = fArgAbsNorm, fArgAbsNorm, f0
426       shr.u          rIndex = rIndex, 2
428 {.mfi
429       ldfe           fA14 = [rCoeffAddr2], 32
430       nop.f          0
431       adds           rCoeffAddr4 = 16, r0
434 {.mfi
435       ldfe           fA13 = [rCoeffAddr1], 32
436       nop.f          0
437       // address of coefficients ##16..23
438       add            rCoeffAddr3 = rCoeffAddr3, rIndex
440 {.mfi
441       ldfe           fA12 = [rCoeffAddr2], 32
442       nop.f          0
443       cmp.lt         p15, p14 = rArg, r0
446 {.mfi
447       ldfe           fA11 = [rCoeffAddr1], 32
448       nop.f          0
449       add            rCoeffAddr4 = rCoeffAddr3, rCoeffAddr4
451 {.mfi
452       ldfe           fA10 = [rCoeffAddr2], 32
453       nop.f          0
454       nop.i          0
457 {.mfi
458       ldfe           fA9 = [rCoeffAddr1], 32
459       nop.f          0
460       nop.i          0
462 {.mfi
463       ldfe           fA8 = [rCoeffAddr2], 32
464       nop.f          0
465       nop.i          0
468 {.mfi
469       ldfe           fA7 = [rCoeffAddr1], 32
470       fms.s1         fArgAbs = fArgAbs, f1, fThreeAndQ
471       nop.i          0
473 {.mfb
474       ldfe           fA6 = [rCoeffAddr2], 32
475       nop.f          0
476 (p8)  br.cond.spnt   erf_3q_4 // branch out if  3.25 < |x| < 4.0
479 {.mfi
480       ldfe           fA5 = [rCoeffAddr1], 32
481       fma.s1         fTDeg3 = fArgAbsNorm, fTSqr, f0
482       nop.i          0
484 {.mfi
485       ldfe           fA4 = [rCoeffAddr2], 32
486       fma.s1         fTQuadr = fTSqr, fTSqr, f0
487       nop.i          0
490 // Path #3 Polynomial Pol19(y) computation; y = fArgAbsNorm
491 {.mfi
492       ldfe           fA3 = [rCoeffAddr3], 32
493       fma.s1         fArgAbsNormSgn = fArgAbsNorm, fSignumX, f0
494       nop.i          0
496 {.mfi
497       ldfe           fA2 = [rCoeffAddr4], 32
498       nop.f          0
499       nop.i          0
502 {.mfi
503       ldfe           fA1 = [rCoeffAddr3], 32
504       fma.s1         fRes = fA19, fArgAbsNorm, fA18
505       nop.i          0
507 {.mfi
508       ldfe           fA0 = [rCoeffAddr4], 32
509       nop.f          0
510       nop.i          0
513 { .mfi
514       nop.m          0
515       fma.s1         fA17 = fA17, fArgAbsNorm, fA16
516       nop.i          0
519 { .mfi
520       nop.m          0
521       fma.s1         fA15 = fA15, fArgAbsNorm, fA14
522       nop.i          0
525 { .mfi
526       nop.m          0
527       fma.s1         fTDeg7 = fTDeg3, fTQuadr, f0
528       nop.i          0
530 { .mfi
531       nop.m          0
532       fma.s1         fA13 = fA13, fArgAbsNorm, fA12
533       nop.i          0
536 { .mfi
537       nop.m          0
538       fma.s1         fA11 = fA11, fArgAbsNorm, fA10
539       nop.i          0
542 { .mfi
543       nop.m          0
544       fma.s1         fA9 = fA9, fArgAbsNorm, fA8
545       nop.i          0
548 { .mfi
549       nop.m          0
550       fma.s1         fRes = fRes, fTSqr, fA17
551       nop.i          0
553 { .mfi
554       nop.m          0
555       fma.s1         fA7 = fA7, fArgAbsNorm, fA6
556       nop.i          0
559 { .mfi
560       nop.m          0
561       fma.s1         fA5 = fA5, fArgAbsNorm, f0
562       nop.i          0
565 { .mfi
566       nop.m          0
567       fma.s1         fA15 = fA15, fTSqr, fA13
568       nop.i          0
570 { .mfi
571       nop.m          0
572       fma.s1         fA4 = fA4, fArgAbsNorm, fA3
573       nop.i          0
576 { .mfi
577       nop.m          0
578       fma.s1         fA2 = fA2, fArgAbsNorm, fA1
579       nop.i          0
582 { .mfi
583       nop.m          0
584       fma.s1         fA11 = fA11, fTSqr, fA9
585       nop.i          0
588 { .mfi
589       nop.m          0
590       fma.s1         fA7 = fA7, fTSqr, fA5
591       nop.i          0
594 { .mfi
595       nop.m          0
596       fma.s1         fRes = fRes, fTQuadr, fA15
597       nop.i          0
600 { .mfi
601       nop.m          0
602       fma.s1         fA4 = fA4, fTSqr, fA2
603       nop.i          0
606 { .mfi
607       nop.m          0
608       fma.s1         fRes = fRes, fTQuadr, fA11
609       nop.i          0
612 { .mfi
613       nop.m          0
614       fma.s1         fA4 = fA7, fTDeg3, fA4
615       nop.i          0
618 { .mfi
619       nop.m          0
620       fma.s1         fRes = fRes,  fTDeg7, fA4
621       nop.i          0
624 { .mfi
625       nop.m          0
626       // result for negative argument
627 (p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
628       nop.i          0
630 { .mfb
631       nop.m          0
632       // result for positive argument
633 (p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA0
634       br.ret.sptk    b0
637 // Here if  3.25 < |x| < 4.0
638 .align 32
639 erf_3q_4:
640 .pred.rel "mutex", p14, p15
641 { .mfi
642       ldfe           fA5 = [rCoeffAddr1], 32
643       fma.s1         fTSqr = fArgAbs, fArgAbs, f0
644       nop.i          0
646 { .mfi
647       nop.m          0
648       fma.s1         fRes = fA19, fArgAbs, fA18
649       nop.i          0
652 { .mfi
653       nop.m          0
654       fma.s1         fA17 = fA17, fArgAbs, fA16
655       nop.i          0
657 { .mfi
658       nop.m          0
659       fma.s1         fA15 = fA15, fArgAbs, fA14
660       nop.i          0
663 { .mfi
664       nop.m          0
665       fma.s1         fA13 = fA13, fArgAbs, fA12
666       nop.i          0
668 { .mfi
669       nop.m          0
670       fma.s1         fA11 = fA11, fArgAbs, fA10
671       nop.i          0
674 { .mfi
675       nop.m          0
676       fma.s1         fA9 = fA9, fArgAbs, fA8
677       nop.i          0
679 { .mfi
680       nop.m          0
681       fma.s1         fArgAbsNormSgn = fArgAbs, fSignumX, f0
682       nop.i          0
685 { .mfi
686       nop.m          0
687       fma.s1         fTQuadr = fTSqr, fTSqr, f0
688       nop.i          0
691 { .mfi
692       nop.m          0
693       fma.s1         fRes = fRes, fTSqr, fA17
694       nop.i          0
697 { .mfi
698       nop.m          0
699       fma.s1         fA15 = fA15, fTSqr, fA13
700       nop.i          0
703 { .mfi
704       nop.m          0
705       fma.s1         fA11 = fA11, fTSqr, fA9
706       nop.i          0
708 { .mfi
709       nop.m          0
710       fma.s1         fA7 = fA7, fArgAbs, fA6
711       nop.i          0
714 { .mfi
715       nop.m          0
716       fma.s1         fTDeg7 = fTQuadr, fTSqr, f0
717       nop.i          0
719 { .mfi
720       nop.m          0
721       fma.s1         fRes = fRes, fTQuadr, fA15
722       nop.i          0
725 { .mfi
726       nop.m          0
727       fma.s1         fA11 = fA11, fTSqr, fA7
728       nop.i          0
731 { .mfi
732       nop.m          0
733       fma.s1         fRes = fRes, fTDeg7, fA11
734       nop.i          0
737 { .mfi
738       nop.m          0
739       // result for negative argument
740 (p15) fms.d.s0       f8 = fRes, fArgAbsNormSgn, fA5
741       nop.i          0
743 { .mfb
744       nop.m          0
745       // result for positive argument
746 (p14) fma.d.s0       f8 = fRes, fArgAbsNormSgn, fA5
747       br.ret.sptk    b0
751 // Here if |x| < 0.5
752 .align 32
753 erf_near_zero:
754 { .mfi
755       adds           rCoeffAddr1 = 1280, rDataPtr // address of A9
756       fma.s1         fTSqr = fArgSqr, fArgSqr, f0 // x^4
757       nop.i          0
759 { .mfi
760       adds           rCoeffAddr2 = 1328, rDataPtr // address of A7
761       nop.f          0
762       nop.i          0
765 { .mfi
766       ldfpd          fA9, fA8 = [rCoeffAddr1], 16
767       nop.f          0
768       nop.i          0
770 { .mfi
771       ldfpd          fA7, fA6 = [rCoeffAddr2], 16
772       nop.f          0
773       nop.i          0
776 { .mfi
777       ldfpd          fA5, fA4 = [rCoeffAddr1], 16
778       nop.f          0
779       nop.i          0
781 { .mfi
782       ldfpd          fA3, fA2 = [rCoeffAddr2], 16
783       nop.f          0
784       nop.i          0
787 { .mfi
788       ldfe           fA1 = [rCoeffAddr1]
789       nop.f          0
790       nop.i          0
792 { .mfi
793       ldfe           fA0 = [rCoeffAddr2]
794       nop.f          0
795       nop.i          0
798 { .mfi
799       nop.m          0
800       fma.s1         fTQuadr = fTSqr, fTSqr, f0
801       nop.i          0
804 { .mfi
805       nop.m          0
806       fma.s1         fRes = fA9, fArgSqr, fA8
807       nop.i          0
809 { .mfi
810       nop.m          0
811       fma.s1         fA7 = fA7, fArgSqr, fA6
812       nop.i          0
815 { .mfi
816       nop.m          0
817       fma.s1         fA3 = fA3, fArgSqr, fA2
818       nop.i          0
820 { .mfi
821       nop.m          0
822       fma.s1         fA5 = fA5, fArgSqr, fA4
823       nop.i          0
826 { .mfi
827       nop.m          0
828       fma.s1         fA1 = fA1, fArgSqr, fA0
829       nop.i          0
831 { .mfi
832       nop.m          0
833       fma.s1         fTQuadrSgn = fTQuadr, f8, f0
834       nop.i          0
837 { .mfi
838       nop.m          0
839       fma.s1         fRes = fRes, fTSqr, fA7
840       nop.i          0
843 { .mfi
844       nop.m          0
845       fma.s1         fA1 = fA3, fTSqr, fA1
846       nop.i          0
849 { .mfi
850       nop.m          0
851       fma.s1         fRes = fRes, fTSqr, fA5
852       nop.i          0
855 { .mfi
856       nop.m          0
857       fma.s1         fA1 = fA1, f8, f0
858       nop.i          0
861 { .mfb
862       nop.m          0
863       fma.d.s0       f8 = fRes, fTQuadrSgn, fA1 // x*Pol9(x^2)
864       br.ret.sptk    b0                              // Exit for |x| < 0.5
867 // Here if 5.90625 <= |x| < +inf
868 .align 32
869 erf_saturation:
870 { .mfi
871       adds           rDataPtr = 1376, rDataPtr     // address of A0
872       nop.f          0
873       nop.i          0
876 { .mfi
877       ldfe           fA0 = [rDataPtr]
878       nop.f          0
879       nop.i          0
882 { .mfb
883       nop.m          0
884       fma.d.s0       f8 = fA0, fSignumX, f0       // sign(x)*(1.0 - 2^(-63))
885       // Exit for 5.90625 <= |x| < +inf
886       br.ret.sptk    b0                          // Exit for 5.90625 <=|x|< +inf
890 // Here if x is double precision denormal
891 .align 32
892 erf_denormal:
893 { .mfi
894       adds           rDataPtr = 1632, rDataPtr    // address of A0
895       fclass.m       p7,p8 = f8, 0x0a             // is x -denormal ?
896       nop.i          0
899 { .mfi
900       ldfe           fA0 = [rDataPtr]             // A0
901       nop.f          0
902       nop.i          0
905 { .mfi
906       nop.m          0
907       fma.s1         fA0 = fA0,f8,f0              // A0*x
908       nop.i          0
911 { .mfi
912       nop.m          0
913 (p7)  fma.d.s0       f8 = f8,f8,fA0               // -denormal
914       nop.i          0
916 { .mfb
917       nop.m          0
918 (p8)  fnma.d.s0      f8 = f8,f8,fA0               // +denormal
919       br.ret.sptk    b0                           // Exit for denormal
923 GLOBAL_LIBM_END(erf)
924 libm_alias_double_other (erf, erf)